Skip to content

Commit

Permalink
Merge pull request #25 from jeffersonfparil/dev
Browse files Browse the repository at this point in the history
feat: argument bounds checking and option filtering
  • Loading branch information
jeffersonfparil authored Jul 14, 2024
2 parents a78f40b + a8e1ada commit 0018676
Show file tree
Hide file tree
Showing 13 changed files with 103 additions and 43 deletions.
57 changes: 57 additions & 0 deletions src/base/filter_stats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use std::io::{self, Error, ErrorKind};

use crate::base::*;

impl Parse<FilterStats> for FilterStats {
fn lparse(&self) -> io::Result<Box<FilterStats>> {
let remove_ns = self.remove_ns.clone();
let remove_monoallelic = self.remove_monoallelic.clone();
let keep_lowercase_reference = self.keep_lowercase_reference.clone();
let max_base_error_rate = if self.max_base_error_rate <= 1.0 && self.max_base_error_rate >= 0.0 {
self.max_base_error_rate.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. max_base_error_rate must be between 0.0 and 1.0",
));
};
let min_coverage_depth = self.min_coverage_depth;
let min_coverage_breadth = if self.min_coverage_breadth <= 1.0 && self.max_base_error_rate >= 0.0 {
self.min_coverage_breadth.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. min_coverage_breadth must be between 0.0 and 1.0",
));
};
let min_allele_frequency = if self.min_allele_frequency <= 1.0 && self.min_allele_frequency >= 0.0 {
self.min_allele_frequency.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. min_allele_frequency must be between 0.0 and 1.0",
));
};
let max_missingness_rate = if self.max_missingness_rate <= 1.0 && self.max_missingness_rate >= 0.0 {
self.max_missingness_rate.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. max_missingness_rate must be between 0.0 and 1.0",
));
};
let pool_sizes = self.pool_sizes.clone();
return Ok(Box::new(FilterStats {
remove_ns,
remove_monoallelic,
keep_lowercase_reference,
max_base_error_rate,
min_coverage_depth,
min_coverage_breadth,
min_allele_frequency,
max_missingness_rate,
pool_sizes,
}));
}
}

1 change: 1 addition & 0 deletions src/base/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ mod pileup;
mod structs_and_traits;
mod sync;
mod vcf;
mod filter_stats;
14 changes: 7 additions & 7 deletions src/base/pileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl Filter for PileupLine {
/// Filter `PileupLine` by:
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// First, make sure we have the correct the correct number of expected pools as the phenotype file
// TODO: Make the error pop-out because it's currently being consumed as None in the only function calling it below.
if self.coverages.len() != filter_stats.pool_sizes.len() {
Expand Down Expand Up @@ -274,7 +274,7 @@ impl Filter for PileupLine {
.count();

if pools_covered != min_coverage_breadth as usize {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
Expand All @@ -292,7 +292,7 @@ impl Filter for PileupLine {
}
}
if unique_alleles.len() < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
}

Expand Down Expand Up @@ -357,9 +357,9 @@ impl Filter for PileupLine {
}
// Filter the whole locus depending on whether or not we have retained at least 2 alleles
if m < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
Ok(self)
Ok(Some(self))
}
}

Expand All @@ -368,8 +368,8 @@ pub fn pileup_to_sync(pileup_line: &mut PileupLine, filter_stats: &FilterStats)
// let mut pileup_line: Box<PileupLine> = line.lparse().unwrap();
// Filter
match pileup_line.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Convert to counts
let locus_counts = match pileup_line.to_counts() {
Expand Down
2 changes: 1 addition & 1 deletion src/base/structs_and_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ pub trait Parse<T> {
pub trait Filter {
fn to_counts(&self) -> io::Result<Box<LocusCounts>>;
fn to_frequencies(&self) -> io::Result<Box<LocusFrequencies>>;
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self>;
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>>;
}

pub trait Sort {
Expand Down
27 changes: 14 additions & 13 deletions src/base/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ impl Filter for LocusCounts {
}

// Filter PileupLine by minimum coverage, minimum quality
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion
// Preliminary check of the structure format
self.check().unwrap();
Expand Down Expand Up @@ -225,7 +225,7 @@ impl Filter for LocusCounts {
.iter()
.fold(sum_coverage[0], |min, &x| if x < min { x } else { min });
if min_sum_coverage < filter_stats.min_coverage_depth as f64 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus
// for i in 0..self.matrix.nrows() {
Expand Down Expand Up @@ -282,7 +282,7 @@ impl Filter for LocusCounts {
}
// Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out
if p < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all
let (n, _p) = allele_frequencies.matrix.dim();
Expand All @@ -291,15 +291,15 @@ impl Filter for LocusCounts {
.slice(s![.., 0])
.fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum });
if n_missing_across_pools == n {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold
if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Return the locus if it passed all the filtering steps
self.matrix = matrix;
Ok(self)
Ok(Some(self))
}
}

Expand Down Expand Up @@ -376,7 +376,7 @@ impl Filter for LocusFrequencies {
}

// Filter PileupLine by minimum coverage, minimum quality
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion
// Also, cannot filter by minimum coverage as that data is lost from counts to frequencies conversion
// Preliminary check of the structure format
Expand Down Expand Up @@ -453,7 +453,7 @@ impl Filter for LocusFrequencies {
}
// Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out
if p < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all
let (n, _p) = allele_frequencies.matrix.dim();
Expand All @@ -462,15 +462,15 @@ impl Filter for LocusFrequencies {
.slice(s![.., 0])
.fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum });
if n_missing_across_pools == n {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold
if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Return the locus if it passed all the filtering steps
self.matrix = matrix;
Ok(self)
Ok(Some(self))
}
}

Expand Down Expand Up @@ -1020,8 +1020,8 @@ impl LoadAll for FileSyncPhen {
},
};
match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => continue,
Ok(Some(x)) => x,
_ => continue,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => *x,
Expand Down Expand Up @@ -1584,6 +1584,7 @@ mod tests {
.clone()
.filter(&filter_stats)
.unwrap()
.unwrap()
.to_frequencies()
.unwrap());
let mut sorted_filtered_frequencies = filtered_frequencies.clone();
Expand Down
18 changes: 9 additions & 9 deletions src/base/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl Filter for VcfLine {
/// Filter `VcfLine` by:
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let mut pools_covered = 0;
Expand All @@ -128,13 +128,13 @@ impl Filter for VcfLine {
}
}
if pools_covered != min_coverage_breadth {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
if self.alternative_alleles.len() == 0 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
}

Expand Down Expand Up @@ -179,18 +179,18 @@ impl Filter for VcfLine {
}
// Filter the whole locus depending on whether or not we have retained at least 2 alleles
if m < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
Ok(self)
Ok(Some(self))
}
}

/// Convert `VcfLine` into a string representing a line or locus in a `sync` file.
pub fn vcf_to_sync(vcf_line: &mut VcfLine, filter_stats: &FilterStats) -> Option<String> {
// Filter
match vcf_line.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Convert to counts
let locus_counts = match vcf_line.to_counts() {
Expand Down Expand Up @@ -557,8 +557,8 @@ mod tests {
);
filtered_vcf_1.filter(&filter_stats_1).unwrap();
let err = match filtered_vcf_2.filter(&filter_stats_2) {
Ok(_) => 0,
Err(_) => 1,
Ok(Some(_)) => 0,
_ => 1,
};
assert_eq!(filtered_vcf_1, vcf_line);
assert_eq!(err, 1);
Expand Down
4 changes: 2 additions & 2 deletions src/gwas/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ pub fn correlation(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x.clone(),
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
Expand Down
4 changes: 2 additions & 2 deletions src/gwas/gwalpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,8 @@ fn prepare_geno_and_pheno_stats(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
Expand Down
4 changes: 2 additions & 2 deletions src/gwas/mle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ pub fn mle_iterate(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
Expand Down
4 changes: 2 additions & 2 deletions src/gwas/ols.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ pub fn ols_iterate(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
Expand Down
3 changes: 2 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ fn main() {
format: phen_format,
};
let phen = file_phen.lparse().unwrap();
let filter_stats = base::FilterStats {
let file_filter_stats = base::FilterStats {
remove_ns: !args.keep_ns,
remove_monoallelic: args.remove_monoallelic,
keep_lowercase_reference: args.keep_lowercase_reference,
Expand All @@ -204,6 +204,7 @@ fn main() {
max_missingness_rate: args.max_missingness_rate,
pool_sizes: phen.pool_sizes.clone(),
};
let filter_stats = file_filter_stats.lparse().unwrap();
if args.analysis == String::from("pileup2sync") {
// PILEUP INPUT
let file_pileup = base::FilePileup {
Expand Down
4 changes: 2 additions & 2 deletions src/tables/chisq_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn chisq(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option<String> {
let locus_counts = match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
Expand Down
4 changes: 2 additions & 2 deletions src/tables/fisher_exact_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ fn hypergeom_ratio(counts: &Array2<f64>, log_prod_fac_marginal_sums: &f64) -> io

pub fn fisher(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option<String> {
let locus_counts = match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Restrict so that the sum is less than or equal to 34, i.e. at n>34 : n! > f64::MAX
let n = locus_counts.matrix.nrows();
Expand Down

0 comments on commit 0018676

Please sign in to comment.