Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: min_coverage_breadth #24

Merged
merged 3 commits into from
Jul 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,18 @@ jobs:
run: |
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage 10
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage-depth 10
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- gwalpha -f ./tests/test.sync -p ./tests/test.py --n-threads 2 --gwalpha-method LS
cargo run -- gwalpha -f ./tests/test.sync -p ./tests/test.py --n-threads 2 --gwalpha-method ML
cargo run -- sync2csv -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
Expand Down
48 changes: 20 additions & 28 deletions src/base/pileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

use crate::base::*;
use ndarray::prelude::*;
use std::collections::HashSet;
use std::fs::{File, OpenOptions};
use std::io::{self, prelude::*, BufReader, BufWriter, Error, ErrorKind, SeekFrom};
use std::str;
Expand Down Expand Up @@ -267,39 +266,32 @@ impl Filter for PileupLine {
}
}

if filter_stats.keep_if_any_meets_coverage {
// At least 1 pool needs to have been covered min_coverage times
let mut met_coverage_requirement = false;
for c in &self.coverages {
if c >= &filter_stats.min_coverage {
met_coverage_requirement = true;
break;
}
}
if !met_coverage_requirement{
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let pools_covered = self.coverages.iter()
.filter(|&&c| c >= filter_stats.min_coverage_depth)
.take(min_coverage_breadth as usize)
.count();

} else {
// All the pools need to have been covered at least min_coverage times
for c in &self.coverages {
if c < &filter_stats.min_coverage {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
if pools_covered != min_coverage_breadth as usize {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
let mut covered_alleles = HashSet::new();
let mut unique_alleles = Vec::new();

for pool in &self.read_codes{
for read in pool{
covered_alleles.insert(read);
'outer: for pool in &self.read_codes {
for &read in pool {
if !unique_alleles.contains(&read) {
unique_alleles.push(read);
if unique_alleles.len() == 2 {
break 'outer;
}
}
}
}

if covered_alleles.len() < 2{
if unique_alleles.len() < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
Expand Down Expand Up @@ -591,10 +583,10 @@ mod tests {
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_if_any_meets_coverage: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
4 changes: 2 additions & 2 deletions src/base/structs_and_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ pub struct FileSyncPhen {
pub struct FilterStats {
pub remove_ns: bool,
pub remove_monoallelic: bool,
pub keep_if_any_meets_coverage: bool,
pub keep_lowercase_reference: bool,
pub max_base_error_rate: f64,
pub min_coverage: u64,
pub min_coverage_depth: u64,
pub min_coverage_breadth: f64,
pub min_allele_frequency: f64,
pub max_missingness_rate: f64,
pub pool_sizes: Vec<f64>,
Expand Down
14 changes: 10 additions & 4 deletions src/base/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ impl Filter for LocusCounts {
sum_coverage
.iter()
.fold(sum_coverage[0], |min, &x| if x < min { x } else { min });
if min_sum_coverage < filter_stats.min_coverage as f64 {
if min_sum_coverage < filter_stats.min_coverage_depth as f64 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus
// for i in 0..self.matrix.nrows() {
// if sum_coverage[i] < filter_stats.min_coverage as f64 {
// if sum_coverage[i] < filter_stats.min_coverage_depth as f64 {
// for j in 0..self.matrix.ncols() {
// self.matrix[(i, j)] = f64::NAN as u64;
// }
Expand Down Expand Up @@ -1569,8 +1569,11 @@ mod tests {
let frequencies = *(counts.to_frequencies().unwrap());
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -1675,8 +1678,11 @@ mod tests {
// let phen = file_phen.lparse().unwrap();
// let filter_stats = FilterStats {
// remove_ns: true,
// remove_monoallelic: false,
// keep_lowercase_reference: false,
// max_base_error_rate: 0.005,
// min_coverage: 0,
// min_coverage_depth: 0,
// min_coverage_breadth: 1.0,
// min_allele_frequency: 0.0001,
// max_missingness_rate: 0.2,
// pool_sizes: phen.pool_sizes,
Expand Down
30 changes: 26 additions & 4 deletions src/base/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,29 @@ impl Filter for VcfLine {
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
// All the pools needs be have been covered at least min_coverage times
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let mut pools_covered = 0;
for i in 0..self.allele_depths.len() {
let c = &self.allele_depths[i].iter().fold(0, |sum, x| sum + x);
if c < &filter_stats.min_coverage {
if c >= &filter_stats.min_coverage_depth {
pools_covered += 1;
}
if pools_covered == min_coverage_breadth {
break;
}
}
if pools_covered != min_coverage_breadth {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
if self.alternative_alleles.len() == 0 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}

// Filter by minimum allele frequency,
//// First convert the counts per pool into frequencies
let allele_frequencies = match self.to_frequencies() {
Expand Down Expand Up @@ -509,16 +525,22 @@ mod tests {
);
let filter_stats_1 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
};
let filter_stats_2 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 10,
min_coverage_depth: 10,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
Expand Down
5 changes: 4 additions & 1 deletion src/gp/cv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,11 @@ mod tests {
let n_threads = 2;
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,11 @@ mod tests {
Array2::from_shape_vec((5, 2), vec![1, 9, 2, 8, 3, 7, 4, 6, 5, 5]).unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/gwalpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,11 @@ mod tests {
.reversed_axes();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
2 changes: 1 addition & 1 deletion src/gwas/ols.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ mod tests {
// let filter_stats = FilterStats {
// remove_ns: true,
// max_base_error_rate: 0.005,
// min_coverage: 1,
// min_coverage_depth: 1,
// min_allele_frequency: 0.005,
// pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
// };
Expand Down
7 changes: 5 additions & 2 deletions src/imputation/adaptive_ld_knn_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -610,7 +613,7 @@ mod tests {
// --phen-pool-size-col 1 \
// --phen-value-col 2 \
// --min-allele-frequency 0.0001 \
// --min-coverage 0 \
// --min-coverage-depth 0 \
// --min-quality 0.01 \
// --max-missingness-rate 0.2 \
// --min-depth-set-to-missing 5 \
Expand Down
5 changes: 4 additions & 1 deletion src/imputation/filtering_missing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,11 @@ mod tests {
////////////////////////////////////////////////////////////////////////////////////////////////////////
let filter_stats = FilterStats {
remove_ns: false,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 1.0,
min_coverage: 0,
min_coverage_depth: 0,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.000001,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down
7 changes: 5 additions & 2 deletions src/imputation/mean_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -263,7 +266,7 @@ mod tests {
// --phen-pool-size-col 1 \
// --phen-value-col 2 \
// --min-allele-frequency 0.0001 \
// --min-coverage 0 \
// --min-coverage-depth 0 \
// --min-quality 0.01 \
// --max-missingness-rate 0.75 \
// --min-depth-set-to-missing 5 \
Expand Down
Loading