diff --git a/src/base/pileup.rs b/src/base/pileup.rs index 078aa83..f6ec660 100644 --- a/src/base/pileup.rs +++ b/src/base/pileup.rs @@ -542,7 +542,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.0, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2], }; let mut filtered_pileup = pileup_line.clone(); diff --git a/src/base/structs_and_traits.rs b/src/base/structs_and_traits.rs index 5c1a068..b24452e 100644 --- a/src/base/structs_and_traits.rs +++ b/src/base/structs_and_traits.rs @@ -63,7 +63,7 @@ pub struct FilterStats { pub min_quality: f64, pub min_coverage: u64, pub min_allele_frequency: f64, - pub min_missingness_rate: f64, + pub max_missingness_rate: f64, pub pool_sizes: Vec, } diff --git a/src/base/sync.rs b/src/base/sync.rs index 403f2c1..4e04a1c 100644 --- a/src/base/sync.rs +++ b/src/base/sync.rs @@ -173,17 +173,6 @@ impl Filter for LocusCounts { .filter(|&&x| !x.is_nan()) .fold(0.0, |sum, &x| sum + x) }); // summation across the columns which means sum of all elements per row while ignoring NANs! - // // Make sure all pools have been convered - // if row_sums - // .iter() - // .fold(row_sums[0], |min, &x| if x < min { x } else { min }) - // == 0 - // { - // return Err(Error::new( - // ErrorKind::Other, - // "At least one pool has no coverage.", - // )); - // } let mut matrix: Array2 = Array2::from_elem((n, p), 0.0 as f64); for i in 0..n { for j in 0..p { @@ -224,13 +213,13 @@ impl Filter for LocusCounts { } // println!("self={:?}", self); // Filter by minimum coverage - // let sum_coverage = self.matrix.sum_axis(Axis(1)); // summation across the columns which means sum of all elements per row + // Summation across the columns which means sum of all elements per row while ignoring NANs! let sum_coverage = self.matrix.map_axis(Axis(1), |r| { r.map(|&x| x as f64) .iter() .filter(|&&x| !x.is_nan()) .fold(0.0, |sum, &x| sum + x) - }); // summation across the columns which means sum of all elements per row while ignoring NANs! + }); let min_sum_coverage = sum_coverage .iter() @@ -238,6 +227,14 @@ impl Filter for LocusCounts { if min_sum_coverage < filter_stats.min_coverage as f64 { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } + // // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus + // for i in 0..self.matrix.nrows() { + // if sum_coverage[i] < filter_stats.min_coverage as f64 { + // for j in 0..self.matrix.ncols() { + // self.matrix[(i, j)] = f64::NAN as u64; + // } + // } + // } // Filter by minimum allele frequency // Before anything else, we clone matrix of allele counts let mut matrix = self.matrix.clone(); @@ -297,7 +294,7 @@ impl Filter for LocusCounts { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } // Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold - if (n_missing_across_pools as f64 / n as f64) > filter_stats.min_missingness_rate { + if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } // Return the locus if it passed all the filtering steps @@ -468,7 +465,7 @@ impl Filter for LocusFrequencies { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } // Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold - if (n_missing_across_pools as f64 / n as f64) > filter_stats.min_missingness_rate { + if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } // Return the locus if it passed all the filtering steps @@ -509,6 +506,7 @@ impl Sort for LocusFrequencies { } impl RemoveMissing for LocusCountsAndPhenotypes { + // Remove pools with missing data in the phenotype file fn remove_missing(&mut self) -> io::Result<&mut Self> { let (n, k) = self.phenotypes.dim(); let n_ = self.pool_names.len(); @@ -551,6 +549,7 @@ impl RemoveMissing for LocusCountsAndPhenotypes { } impl RemoveMissing for GenotypesAndPhenotypes { + // Remove pools with missing data in the phenotype file fn remove_missing(&mut self) -> io::Result<&mut Self> { let (n, k) = self.phenotypes.dim(); let n_ = self.pool_names.len(); @@ -1573,7 +1572,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![20., 20., 20., 20., 20.], }; let mut filtered_counts = counts.clone(); @@ -1679,7 +1678,7 @@ mod tests { // min_quality: 0.005, // min_coverage: 0, // min_allele_frequency: 0.0001, - // min_missingness_rate: 0.2, + // max_missingness_rate: 0.2, // pool_sizes: phen.pool_sizes, // }; // let file_sync_phen = *(file_sync, file_phen).lparse().unwrap(); diff --git a/src/gp/cv.rs b/src/gp/cv.rs index 08964f5..2959f40 100644 --- a/src/gp/cv.rs +++ b/src/gp/cv.rs @@ -447,7 +447,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2], }; let _freqs = file_sync_phen diff --git a/src/gwas/correlation_test.rs b/src/gwas/correlation_test.rs index 2a2fa12..3578a53 100644 --- a/src/gwas/correlation_test.rs +++ b/src/gwas/correlation_test.rs @@ -151,7 +151,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0], }; let locus_counts = LocusCounts { diff --git a/src/gwas/gwalpha.rs b/src/gwas/gwalpha.rs index 8f529fc..c7fc66e 100644 --- a/src/gwas/gwalpha.rs +++ b/src/gwas/gwalpha.rs @@ -419,7 +419,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0], }; let locus_counts = LocusCounts { diff --git a/src/imputation/adaptive_ld_knn_imputation.rs b/src/imputation/adaptive_ld_knn_imputation.rs index 745c9f4..aae16a9 100644 --- a/src/imputation/adaptive_ld_knn_imputation.rs +++ b/src/imputation/adaptive_ld_knn_imputation.rs @@ -1,7 +1,152 @@ use crate::base::*; use crate::gwas::pearsons_correlation; use ndarray::{prelude::*, Zip}; -use std::io::{self, Error, ErrorKind}; +use std::io; + +fn calculate_euclidean_distances( + j: usize, + window_freqs: ArrayView2, + corr: ArrayView2, + min_correlation: &f64, +) -> io::Result> { + let (n, p) = window_freqs.dim(); + // Find the indexes of the linked (positively correlated) alleles to be used in calculating the distances between pools to find the nearest neighbours + let idx_linked_alleles = corr + .column(j) + .iter() + .enumerate() + .filter(|&(_i, x)| !x.is_nan()) + .filter(|&(_i, x)| x >= min_correlation) + .map(|(i, _x)| i) + .collect::>(); + // Use all the alleles to estimate distances between pools to find the nearest neighbours + let idx_linked_alleles = if idx_linked_alleles.len() < 2 { + (0..p).collect() + } else { + idx_linked_alleles + }; + // Calculate the Euclidean distances between pools using the most positively correlated alleles (or all the alleles if none passed the minimum correlation threshold or if there is less that 2 loci that are most correlated - these minimum 2 is the allele itself and another allele) + let mut dist: Array2 = Array2::from_elem((n, n), f64::NAN); + let window_freqs_linked_alleles = window_freqs.select(Axis(1), &idx_linked_alleles); + for i0 in 0..n { + let pool0 = window_freqs_linked_alleles.row(i0); + for i1 in i0..n { + let pool1 = window_freqs_linked_alleles.row(i1).to_owned(); + // Keep only the loci present in both pools + let (pool0, pool1): (Vec, Vec) = pool0 + .iter() + .zip(pool1.iter()) + .filter(|&(&x, &y)| (!x.is_nan()) & (!y.is_nan())) + .unzip(); + if pool0.len() == 0 { + continue; + } else { + let d = (&Array1::from_vec(pool0) - &Array1::from_vec(pool1)) + .map(|&x| x.powf(2.0)) + .sum() + .sqrt(); + match d.is_nan() { + true => continue, + false => { + dist[(i0, i1)] = d; + dist[(i1, i0)] = d; + } + }; + } + } + } + Ok(dist) +} + +fn find_k_nearest_neighbours( + j: usize, + i: usize, + k: &mut usize, + window_freqs: ArrayView2, + dist: ArrayView2, +) -> io::Result<(Vec, Vec, Array1)> { + let (n, p) = window_freqs.dim(); + // Find the k-nearest neighbours + let mut idx_pools: Vec = (0..n).collect(); + idx_pools.sort_by(|&j0, &j1| { + let a = match dist[(i, j0)].is_nan() { + true => f64::INFINITY, + false => dist[(i, j0)], + }; + let b = match dist[(i, j1)].is_nan() { + true => f64::INFINITY, + false => dist[(i, j1)], + }; + a.partial_cmp(&b).unwrap() + }); + let freqs_sorted_neighbours = window_freqs + .select(Axis(0), &idx_pools) + .column(j) + .to_owned(); + // Extract the allele frequencies and distances of the k-neighbours from sorted lists using idx_pools whose indexes are sorted + let mut freqs_k_neighbours = + freqs_sorted_neighbours.select(Axis(0), &(0..*k).collect::>()); + let mut dist_k_neighbours = dist + .column(i) + .select(Axis(0), &idx_pools) + .select(Axis(0), &(0..*k).collect::>()); + while freqs_k_neighbours.fold(0, |sum, &x| if x.is_nan() { sum + 1 } else { sum }) == *k { + *k += 1; + if *k > n { + break; + } + freqs_k_neighbours = + freqs_sorted_neighbours.select(Axis(0), &(0..*k).collect::>()); + dist_k_neighbours = dist + .column(i) + .select(Axis(0), &idx_pools) + .select(Axis(0), &(0..*k).collect::>()); + } + // Keep only non-missing frequencies and distances among the k-neighbours + let (freqs_k_neighbours, dist_k_neighbours): (Vec, Vec) = freqs_k_neighbours + .iter() + .zip(dist_k_neighbours.iter()) + .filter(|&(&x, &y)| (!x.is_nan()) & (!y.is_nan())) + .unzip(); + Ok(( + freqs_k_neighbours, + dist_k_neighbours, + freqs_sorted_neighbours, + )) +} + +fn correct_allele_frequencies_per_locus<'w>( + a: usize, + i: usize, + j: usize, + idx_ini: usize, + window_freqs: &'w mut Array2, + idx_window_head: &Vec, + idx_window_tail: &Vec, + loci_idx: &Vec, +) -> io::Result<&'w mut Array2> { + // Include the start of the next window, i.e. the marker for the end of the last locus in the current window + let loci_start_indexes_within_the_current_window = + loci_idx[idx_window_head[a]..(idx_window_tail[a] + 2)].to_vec(); + for j_ in 1..loci_start_indexes_within_the_current_window.len() { + // Are we at the last allele of the locus? + if (loci_start_indexes_within_the_current_window[j_] - 1) == (idx_ini + j) { + // If we are then we find the start of this locus, i.e. its local index + let j_ini = loci_start_indexes_within_the_current_window[j_ - 1] - idx_ini; + let freqs_sum = window_freqs + .slice(s![i, j_ini..(j + 1)]) + .fold(0.0, |sum, &x| if !x.is_nan() { sum + x } else { sum }) + + f64::EPSILON; + if freqs_sum != 1.0 { + for j_ in j_ini..(j + 1) { + window_freqs[(i, j_)] = window_freqs[(i, j_)] / freqs_sum; + } + } + break; + } + } + Ok(window_freqs) +} impl GenotypesAndPhenotypes { pub fn adaptive_ld_knn_imputation( @@ -12,9 +157,9 @@ impl GenotypesAndPhenotypes { min_correlation: &f64, k_neighbours: &u64, ) -> io::Result<&mut Self> { + self.check().unwrap(); // We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on. let (n, _p) = self.intercept_and_allele_frequencies.dim(); - let (n_, l) = self.coverages.dim(); // Define sliding windows let (loci_idx, loci_chr, loci_pos) = self.count_loci().unwrap(); let mut loci_chr_no_redundant_tail = loci_chr.to_owned(); @@ -29,16 +174,8 @@ impl GenotypesAndPhenotypes { min_loci_per_window, ) .unwrap(); - // println!("loci_idx={:?}", loci_idx); - // println!("idx_window_head={:?}", idx_window_head); - // println!("idx_window_tail={:?}", idx_window_tail); - // println!("loci_chr={:?}; loci_pos={:?}", loci_chr, loci_pos); - // println!("idx_window_head={:?}; idx_window_tail={:?}", idx_window_head, idx_window_tail); let w = idx_window_head.len(); let w_ = idx_window_tail.len(); - assert_eq!(n, n_); - assert_eq!(w, w_); - // Parallel processing per window let mut vec_windows_freqs: Vec> = vec![Array2::from_elem((1, 1), f64::NAN); w]; let idx_windows: Vec = (0..w).collect(); @@ -47,14 +184,9 @@ impl GenotypesAndPhenotypes { .par_for_each(|window_freqs, &a| { // for a in 0..idx_windows.len() { let idx_ini = loci_idx[idx_window_head[a]]; - let idx_fin = loci_idx[idx_window_tail[a]+1]; // add one so that we include the last part of the window! - // for i in idx_ini..idx_fin { - // println!("chr_{}:{}", self.chromosome[i], self.position[i]); - // } + let idx_fin = loci_idx[idx_window_tail[a] + 1]; // add one so that we include the last part of the window! let p = idx_fin - idx_ini; - // println!("p={}", p); *window_freqs = self - // let mut window_freqs = self .intercept_and_allele_frequencies .slice(s![.., idx_ini..idx_fin]) .to_owned(); @@ -75,149 +207,40 @@ impl GenotypesAndPhenotypes { }; } } - // println!("corr:\n{:?}", corr); for j in 0..p { - // if (self.chromosome[idx_ini..idx_fin][j] == "Chromosome1") & (self.position[idx_ini..idx_fin][j] == 28043452) { - // println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - // println!("self.chromosome[idx_ini..idx_fin][j]={:?}", self.chromosome[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][j]={:?}", self.position[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][p-1]={:?}", self.position[idx_ini..idx_fin][p-1]); - // println!("idx_fin={}", idx_fin); - // println!("window_freqs={:?}", &window_freqs); - // println!("corr={:?}", &corr); - // } if window_freqs .column(j) .fold(0, |sum, &x| if x.is_nan() { sum + 1 } else { sum }) == 0 { // No missing data on the locus - // println!("window_freqs.column(j)={:?}", window_freqs.column(j)); continue; } else { - // // Find indexes of the current locus which can have multiple alleles - // let locus_absolute_idx_ini = self.start_index_of_each_locus[idx_ini..idx_fin][j]; - // let locus_absolute_idx_fin = if j < (p-1) { - // self.start_index_of_each_locus[idx_ini..idx_fin][j + 1] - // } else { - // self.start_index_of_each_locus.len() as u64 - // }; - - // Find the indexes of the linked (positively correlated) alleles to be used in calculating the distances between pools to find the nearest neighbours - let idx_linked_alleles = corr - .column(j) - .iter() - .enumerate() - .filter(|&(_i, x)| !x.is_nan()) - .filter(|&(_i, x)| x >= min_correlation) - .map(|(i, _x)| i) - .collect::>(); - // Use all the alleles to estimate distances between pools to find the nearest neighbours - let idx_linked_alleles = if idx_linked_alleles.len() < 2 { - (0..p).collect() - } else { - idx_linked_alleles - }; - // Calculate the Euclidean distances between pools using the most positively correlated alleles (or all the alleles if none passed the minimum correlation threshold or if there is less that 2 loci that are most correlated - these minimum 2 is the allele itself and another allele) - let mut dist: Array2 = Array2::from_elem((n, n), f64::NAN); - let window_freqs_linked_alleles = - window_freqs.select(Axis(1), &idx_linked_alleles); - for i0 in 0..n { - let pool0 = window_freqs_linked_alleles.row(i0); - // println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - // println!("idx_linked_alleles:\n{:?}", &idx_linked_alleles); - // println!("pool0:\n{:?}", &pool0); - for i1 in i0..n { - let pool1 = window_freqs_linked_alleles.row(i1).to_owned(); - // Keep only the loci present in both pools - let (pool0, pool1): (Vec, Vec) = pool0 - .iter() - .zip(pool1.iter()) - .filter(|&(&x, &y)| (!x.is_nan()) & (!y.is_nan())) - .unzip(); - if pool0.len() == 0 { - continue; - } else { - let d = (&Array1::from_vec(pool0) - &Array1::from_vec(pool1)) - .map(|&x| x.powf(2.0)) - .sum() - .sqrt(); - match d.is_nan() { - true => continue, - false => { - dist[(i0, i1)] = d; - dist[(i1, i0)] = d; - } - }; - } - } - } - // println!("dist={:?}", dist); + let dist = calculate_euclidean_distances( + j, + window_freqs.view(), + corr.view(), + min_correlation, + ) + .unwrap(); // Now, let's find the pools needing imputation and impute them using the k-nearest neighbours for i in 0..n { let mut k = *k_neighbours as usize; // define the adaptive k for use later if window_freqs[(i, j)].is_nan() == false { - // println!("self.chromosome[idx_ini..idx_fin][j]={:?}", self.chromosome[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][j]={:?}", self.position[idx_ini..idx_fin][j]); - // println!("window_freqs[(i, j)]={:?}", window_freqs[(i, j)]); continue; } else { - // Find the k-nearest neighbours - let mut idx_pools: Vec = (0..n).collect(); - idx_pools.sort_by(|&j0, &j1| { - let a = match dist[(i, j0)].is_nan() { - true => f64::INFINITY, - false => dist[(i, j0)], - }; - let b = match dist[(i, j1)].is_nan() { - true => f64::INFINITY, - false => dist[(i, j1)], - }; - a.partial_cmp(&b).unwrap() - }); - // println!("dist.column(i):\n{:?}", dist.column(i)); - // println!("idx_pools:\n{:?}", idx_pools); - let freqs_sorted_neighbours = window_freqs - .select(Axis(0), &idx_pools) - .column(j) - .to_owned(); - // println!("freqs_sorted_neighbours:\n{:?}", freqs_sorted_neighbours); - // println!("freqs_unsorted_neighbours:\n{:?}", self.intercept_and_allele_frequencies.column(idx_ini + j)); - // Extract the allele frequencies and distances of the k-neighbours from sorted lists using idx_pools whose indexes are sorted - let mut freqs_k_neighbours = freqs_sorted_neighbours - .select(Axis(0), &(0..k).collect::>()); - let mut dist_k_neighbours = dist - .column(i) - .select(Axis(0), &idx_pools) - .select(Axis(0), &(0..k).collect::>()); - // println!("freqs_k_neighbours:\n{:?}", freqs_k_neighbours); - // println!("dist_k_neighbours:\n{:?}", dist_k_neighbours); - while freqs_k_neighbours.fold(0, |sum, &x| { - if x.is_nan() { - sum + 1 - } else { - sum - } - }) == k - { - k += 1; - if k > n { - break; - } - freqs_k_neighbours = freqs_sorted_neighbours - .select(Axis(0), &(0..k).collect::>()); - dist_k_neighbours = dist - .column(i) - .select(Axis(0), &idx_pools) - .select(Axis(0), &(0..k).collect::>()); - } - // Keep only non-missing frequencies and distances among the k-neighbours - let (freqs_k_neighbours, dist_k_neighbours): (Vec, Vec) = - freqs_k_neighbours - .iter() - .zip(dist_k_neighbours.iter()) - .filter(|&(&x, &y)| (!x.is_nan()) & (!y.is_nan())) - .unzip(); + let ( + freqs_k_neighbours, + dist_k_neighbours, + freqs_sorted_neighbours, + ) = find_k_nearest_neighbours( + j, + i, + &mut k, + window_freqs.view(), + dist.view(), + ) + .unwrap(); if freqs_k_neighbours.len() == 0 { // If the pool freqs and distances do not correspond to non-missing info then we simply use the mean across all non-missing pools window_freqs[(i, j)] = match freqs_sorted_neighbours @@ -230,13 +253,11 @@ impl GenotypesAndPhenotypes { None => f64::NAN, }; } else { - // println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - // println!("freqs_k_neighbours:\n{:?}", freqs_k_neighbours); - // println!("dist_k_neighbours:\n{:?}", dist_k_neighbours); // Impute the missing data using the weighted allele frequencies from the k-nearest neighbours let dist_sum = - dist_k_neighbours.iter().fold(0.0, |sum, &x| sum + x) + f64::EPSILON; // Add a very small value so that we don;t get undefined values when all distances are zero - // println!("dist_sum:\n{:?}", dist_sum); + dist_k_neighbours.iter().fold(0.0, |sum, &x| sum + x) + + f64::EPSILON; // Add a very small value so that we don;t get undefined values when all distances are zero + // println!("dist_sum:\n{:?}", dist_sum); let weights = dist_k_neighbours .iter() .map(|&x| 1.0 - (x / dist_sum) + f64::EPSILON) @@ -256,75 +277,33 @@ impl GenotypesAndPhenotypes { println!("values:\n{:?}", values); } window_freqs[(i, j)] = (&values * &weights).sum(); - // if (self.chromosome[idx_ini..idx_fin][j] == "Chromosome1") & (self.position[idx_ini..idx_fin][j] == 48525060) { - // println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - // println!("j={:?}", j); - // println!("i={:?}", i); - // println!("self.chromosome[idx_ini..idx_fin][j]={:?}", self.chromosome[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][j]={:?}", self.position[idx_ini..idx_fin][j]); - // println!("window_freqs={:?}", &window_freqs); - // println!("dist={:?}", &dist); - // println!("corr={:?}", &corr); - // println!("weights_sum={:?}", &weights_sum); - // println!("weights={:?}", &weights); - // println!("values={:?}", &values); - // println!("window_freqs[(i, j)]={:?}", window_freqs[(i, j)]); - // } } } // Need to correct for when the imputed allele frequencies do not add up to one! if j > 0 { - // Include the start of the next window, i.e. the marker for the end of the last locus in the current window - let loci_start_indexes_within_the_current_window = &loci_idx[idx_window_head[a]..(idx_window_tail[a]+2)]; - // if (self.chromosome[idx_ini..idx_fin][j] == "Chromosome1") & (self.position[idx_ini..idx_fin][j] == 28043452) { - // println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); - // println!("i={:?}", i); - // println!("j={:?}", j); - // println!("self.chromosome[idx_ini..idx_fin][j]={:?}", self.chromosome[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][j]={:?}", self.position[idx_ini..idx_fin][j]); - // println!("loci_start_indexes_within_the_current_window={:?}", loci_start_indexes_within_the_current_window); - // println!("window_freqs={:?}", window_freqs); - // } - for j_ in 1..loci_start_indexes_within_the_current_window.len() { - // Are we at the last allele of the locus? - if (loci_start_indexes_within_the_current_window[j_] - 1) == (idx_ini + j) { - // If we are then we find the start of this locus, i.e. its local index - let j_ini = loci_start_indexes_within_the_current_window[j_-1] - idx_ini; - let freqs_sum = window_freqs.slice(s![i, j_ini..(j+1)]).fold(0.0 ,|sum, &x| if !x.is_nan() {sum + x} else {sum}) + f64::EPSILON; - // if (self.chromosome[idx_ini..idx_fin][j] == "Chromosome1") & (self.position[idx_ini..idx_fin][j] == 28043452) { - // println!("----------------------------------------------------------------"); - // println!("i={:?}", i); - // println!("j={:?}", j); - // println!("j_ini={:?}", j_ini); - // println!("self.chromosome[idx_ini..idx_fin][j]={:?}", self.chromosome[idx_ini..idx_fin][j]); - // println!("self.position[idx_ini..idx_fin][j]={:?}", self.position[idx_ini..idx_fin][j]); - // println!("loci_start_indexes_within_the_current_window={:?}", loci_start_indexes_within_the_current_window); - // println!("window_freqs={:?}", window_freqs); - // println!("window_freqs.slice(s![i, j_ini..(j+1)])={:?}", window_freqs.slice(s![i, j_ini..(j+1)])); - // println!("freqs_sum={:?}", freqs_sum); - // } - if freqs_sum != 1.0 { - for j_ in j_ini..(j+1) { - window_freqs[(i, j_)] = window_freqs[(i, j_)] / freqs_sum; - } - } - break; - } - } + correct_allele_frequencies_per_locus( + a, + i, + j, + idx_ini, + window_freqs, + &idx_window_head, + &idx_window_tail, + &loci_idx, + ) + .unwrap(); } - } - } - } - // println!("*window_freqs:\n{:?}", *window_freqs); - // println!("*window_freqs:\n{:?}", window_freqs); - }); + } // Impute across pools with missing data + } // Impute if we have missing data + } // Iterate across alleles across loci within the window + }); // Parallel processing across windows // } // println!("vec_windows_freqs[0]:\n{:?}", vec_windows_freqs[0]); // Write-out the imputed data for a in 0..w { // Use the indexes of each locus let idx_ini = loci_idx[idx_window_head[a]]; - let idx_fin = loci_idx[idx_window_tail[a] + 1]; // add one so that we include the last part of the window! + let idx_fin = loci_idx[idx_window_tail[a] + 1]; // add one so that we include the last part of the window! let p = idx_fin - idx_ini; for i in 0..n { for j in 0..p { @@ -342,6 +321,8 @@ pub fn impute_aLDkNN( file_sync_phen: &FileSyncPhen, filter_stats: &FilterStats, min_depth_set_to_missing: &f64, + frac_top_missing_pools: &f64, + frac_top_missing_loci: &f64, window_size_bp: &u64, window_slide_size_bp: &u64, min_loci_per_window: &u64, @@ -359,26 +340,46 @@ pub fn impute_aLDkNN( let end = std::time::SystemTime::now(); let duration = end.duration_since(start).unwrap(); println!( - "Parsing the sync into allele frequncies: {} seconds", + "Parsed the sync file into allele frequncies: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), duration.as_secs() ); + let start = std::time::SystemTime::now(); + genotypes_and_phenotypes + .set_missing_by_depth(min_depth_set_to_missing) + .unwrap(); + let end = std::time::SystemTime::now(); + let duration = end.duration_since(start).unwrap(); println!( - "genotypes_and_phenotypes.intercept_and_allele_frequencies:\n{:?}", - genotypes_and_phenotypes.intercept_and_allele_frequencies + "Set missing loci below the minimum depth: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + duration.as_secs() ); let start = std::time::SystemTime::now(); genotypes_and_phenotypes - .set_missing_by_depth(min_depth_set_to_missing) + .filter_out_top_missing_pools(frac_top_missing_pools) .unwrap(); let end = std::time::SystemTime::now(); let duration = end.duration_since(start).unwrap(); println!( - "Set missing loci below the minimum depth: {} seconds", + "Filtered out sparsest pools: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), duration.as_secs() ); + let start = std::time::SystemTime::now(); + genotypes_and_phenotypes + .filter_out_top_missing_loci(frac_top_missing_loci) + .unwrap(); + let end = std::time::SystemTime::now(); + let duration = end.duration_since(start).unwrap(); println!( - "genotypes_and_phenotypes.intercept_and_allele_frequencies:\n{:?}", - genotypes_and_phenotypes.intercept_and_allele_frequencies + "Filtered out sparsest loci: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + duration.as_secs() ); let start = std::time::SystemTime::now(); genotypes_and_phenotypes @@ -392,7 +393,27 @@ pub fn impute_aLDkNN( .unwrap(); let end = std::time::SystemTime::now(); let duration = end.duration_since(start).unwrap(); - println!("Imputation: {} seconds", duration.as_secs()); + println!( + "Adaptive LD-kNN imputation: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + duration.as_secs() + ); + // Removing missing data, i.e. loci that cannot be imputed + // First remove missing data + let start = std::time::SystemTime::now(); + genotypes_and_phenotypes + .filter_out_top_missing_loci(&1.00) + .unwrap(); // Remove 100% of the loci with missing data + let end = std::time::SystemTime::now(); + let duration = end.duration_since(start).unwrap(); + println!( + "Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Duration: {} seconds", + genotypes_and_phenotypes.coverages.nrows(), + genotypes_and_phenotypes.coverages.ncols(), + duration.as_secs() + ); + // Output let out = genotypes_and_phenotypes .write_csv(filter_stats, keep_p_minus_1, out, n_threads) .unwrap(); @@ -423,7 +444,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![20., 20., 20., 20., 20.], }; let n_threads = 2; @@ -446,6 +467,8 @@ mod tests { "Before imputation:\n{:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies ); + let frac_top_missing_pools = 0.0; + let frac_top_missing_loci = 0.2; let window_size_bp = 1e6 as u64; let window_slide_size_bp = window_size_bp; let min_loci_per_window = 1; @@ -469,6 +492,8 @@ mod tests { &file_sync_phen, &filter_stats, &min_depth_set_to_missing, + &frac_top_missing_pools, + &frac_top_missing_loci, &window_size_bp, &window_slide_size_bp, &min_loci_per_window, @@ -480,23 +505,38 @@ mod tests { .unwrap(); assert_eq!(outname, "test-impute_aLDkNN.csv".to_owned()); // Do better!!! Load data - thus working on improving load_table() - // NOTE: ADD TESTS TO MAKE SURE FREQUENCIES IN EACH LOCUS ADDS UP TO ONE - + println!("frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])={:?}", frequencies_and_phenotypes.intercept_and_allele_frequencies.slice(s![0..5, 39..42])); assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 1)], - 0.20596581159779487 + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .slice(s![0..5, 1..3]) + .sum_axis(Axis(1)) + .map(|x| sensible_round(*x, 2)), + Array1::from_elem(5, 1.0) ); assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 2)], - 0.7940341884022049 + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .slice(s![0..5, 39..42]) + .sum_axis(Axis(1)) + .map(|x| sensible_round(*x, 2)), + Array1::from_elem(5, 1.0) ); assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 13923)], - 0.02710654974164676 + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .slice(s![0..5, 119..121]) + .sum_axis(Axis(1)) + .map(|x| sensible_round(*x, 2)), + Array1::from_elem(5, 1.0) ); assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 13924)], - 0.972893450258353 + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .slice(s![0..5, 400..402]) + .sum_axis(Axis(1)) + .map(|x| sensible_round(*x, 2)), + Array1::from_elem(5, 1.0) ); // assert_eq!(0, 1); } diff --git a/src/imputation/filtering_missing.rs b/src/imputation/filtering_missing.rs new file mode 100644 index 0000000..c4e1675 --- /dev/null +++ b/src/imputation/filtering_missing.rs @@ -0,0 +1,354 @@ +use ndarray::prelude::*; +use std::io::{self, Error, ErrorKind}; + +use crate::base::*; + +impl GenotypesAndPhenotypes { + pub fn set_missing_by_depth( + &mut self, + min_depth_set_to_missing: &f64, + ) -> io::Result<&mut Self> { + self.check().unwrap(); + let (n, p) = self.intercept_and_allele_frequencies.dim(); + let (_n, l) = self.coverages.dim(); + let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().unwrap(); + for i in 0..n { + for j in 0..l { + if self.coverages[(i, j)] < *min_depth_set_to_missing { + self.coverages[(i, j)] = f64::NAN; + // Use the indexes of the locus to set missing values to all alleles in the locus + let idx_ini = loci_idx[j] as usize; + let idx_fin = if j < (l - 1) { + loci_idx[j + 1] as usize + } else { + loci_idx[l - 1] as usize + }; + for k in idx_ini..idx_fin { + self.intercept_and_allele_frequencies[(i, k)] = f64::NAN; + } + } + } + } + self.check().unwrap(); + Ok(self) + } + + pub fn filter_out_top_missing_pools( + &mut self, + frac_top_missing_pools: &f64, + ) -> io::Result<&mut Self> { + self.check().unwrap(); + let n = self.intercept_and_allele_frequencies.nrows(); + let p = self.intercept_and_allele_frequencies.ncols() - 1; + let missingness_per_pool: Array1 = self + .intercept_and_allele_frequencies + .map_axis(Axis(1), |x| { + x.fold( + 0 as u64, + |n_missing, &x| if x.is_nan() { n_missing + 1 } else { n_missing }, + ) + }) + .map(|&x| x as f64 / p as f64); + // println!("missingness_per_pool={:?}", missingness_per_pool); + // Define the total number of pools that will be retained + let n_missing = + missingness_per_pool.fold(0.0, |sum, &x| if x > 0.0 { sum + 1.0 } else { sum }); + let n_after_filtering = n - (n_missing * frac_top_missing_pools).ceil() as usize; + if n_after_filtering == 0 { + return Err(Error::new( + ErrorKind::Other, + "No pools left after filtering, please reduce 'frac_top_missing_pools'".to_owned(), + )); + } + // Sort by increasing missingness + let mut idx = (0..n).collect::>(); + idx.sort_by(|&a, &b| { + missingness_per_pool[a] + .partial_cmp(&missingness_per_pool[b]) + .unwrap() + }); + // println!("idx={:?}", idx); + // Omit the pools with high missingness rates + idx = idx[0..n_after_filtering].to_vec(); + // println!("idx={:?}", idx); + // Sort the indexes so that we maintain the order of the pools + idx.sort(); + // println!("idx={:?}", idx); + let mut new_intercept_and_allele_frequencies = + Array2::from_elem((n_after_filtering, p + 1), f64::NAN); + let mut new_phenotypes: Array2 = + Array2::from_elem((n_after_filtering, self.phenotypes.ncols()), f64::NAN); + let mut new_pool_names: Vec = vec![]; + let mut new_coverages: Array2 = + Array2::from_elem((n_after_filtering, self.coverages.ncols()), f64::NAN); + for i in 0..n_after_filtering { + // Pool index in the old data + let i_ = idx[i]; + // Intercept + new_intercept_and_allele_frequencies[(i, 0)] = 1.0; + for j in 1..(p + 1) { + new_intercept_and_allele_frequencies[(i, j)] = + self.intercept_and_allele_frequencies[(i_, j)]; + } + for k in 0..new_phenotypes.ncols() { + new_phenotypes[(i, k)] = self.phenotypes[(i_, k)]; + } + new_pool_names.push(self.pool_names[i_].to_owned()); + for l in 0..new_coverages.ncols() { + new_coverages[(i, l)] = self.coverages[(i_, l)]; + } + } + self.intercept_and_allele_frequencies = new_intercept_and_allele_frequencies; + self.phenotypes = new_phenotypes; + self.pool_names = new_pool_names; + self.coverages = new_coverages; + // println!("self.chromosome.len()={:?}", self.chromosome.len()); + // println!("self.position.len()={:?}", self.position.len()); + // println!("self.allele.len()={:?}", self.allele.len()); + // println!("self.intercept_and_allele_frequencies.dim()={:?}", self.intercept_and_allele_frequencies.dim()); + // println!("self.coverages.len()={:?}", self.coverages.dim()); + self.check().unwrap(); + Ok(self) + } + + // For thinning the data before imputation and also removing completely missing loci after imputation + pub fn filter_out_top_missing_loci( + &mut self, + frac_top_missing_loci: &f64, + ) -> io::Result<&mut Self> { + self.check().unwrap(); + let n = self.intercept_and_allele_frequencies.nrows(); + let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().unwrap(); + let l = loci_idx.len() - 1; // Less one for the trailing locus + // Define loci start and end indexes + let loci_idx_ini = loci_idx[0..l].to_vec(); + let loci_idx_fin = loci_idx[1..(l + 1)].to_vec(); + let missingness_per_locus: Array1 = self + .coverages + .map_axis(Axis(0), |x| { + x.fold( + 0 as u64, + |n_missing, &x| if x.is_nan() { n_missing + 1 } else { n_missing }, + ) + }) + .map(|&x| x as f64 / n as f64); + // println!("missingness_per_locus={:?}", missingness_per_locus); + // Define the total number of pools that will be retained + let l_missing = + missingness_per_locus.fold(0.0, |sum, &x| if x > 0.0 { sum + 1.0 } else { sum }); + // Define the number of loci kept after filtering + let l_after_filtering = l - (l_missing * frac_top_missing_loci).ceil() as usize; + if l_after_filtering == 0 { + return Err(Error::new( + ErrorKind::Other, + "No loci left after filtering, please reduce 'frac_top_missing_loci'".to_owned(), + )); + } + // Sort by increasing missingness + let mut idx = (0..l).collect::>(); + idx.sort_by(|&a, &b| { + missingness_per_locus[a] + .partial_cmp(&missingness_per_locus[b]) + .unwrap() + }); + // println!("idx={:?}", idx); + // Omit the loci with high missingness rates + idx = idx[0..l_after_filtering].to_vec(); + // println!("idx={:?}", idx); + // Sort the indexes so that we maintain the order of the loci + idx.sort(); + // println!("idx={:?}", idx); + // Find the number of columns first using the first pool and also set the loci coordinates + let mut vec_intercept_and_allele_frequencies_pool_0: Vec = + vec![self.intercept_and_allele_frequencies[(0, 0)]]; + let mut new_chromosome: Vec = vec![self.chromosome[0].to_owned()]; + let mut new_position: Vec = vec![self.position[0]]; + let mut new_allele: Vec = vec![self.allele[0].to_owned()]; + let i = 0; + for j in 0..l_after_filtering { + let j_ = idx[j]; + // Use the indexes of the locus to set missing values to all alleles in the locus + let idx_ini = loci_idx_ini[j_]; + let idx_fin = loci_idx_fin[j_]; + for k in idx_ini..idx_fin { + vec_intercept_and_allele_frequencies_pool_0 + .push(self.intercept_and_allele_frequencies[(i, k)]); + new_chromosome.push(self.chromosome[k].to_owned()); + new_position.push(self.position[k]); + new_allele.push(self.allele[k].to_owned()); + } + } + // Populate the intercept and allele frequencies matrix + let p = vec_intercept_and_allele_frequencies_pool_0.len(); // includes the intercept + let mut new_intercept_and_allele_frequencies = Array2::from_elem((n, p), f64::NAN); + let mut new_coverages = Array2::from_elem((n, l_after_filtering), f64::NAN); + for i in 0..n { + let mut k_new = 1; // Start at the second index after the intercept + for j in 0..l_after_filtering { + // Intercept + new_intercept_and_allele_frequencies[(i, 0)] = 1.0; + let j_ = idx[j]; + let idx_ini = loci_idx_ini[j_]; + let idx_fin = loci_idx_fin[j_]; + for k in 0..(idx_fin - idx_ini) { + let k_old = idx_ini + k; + new_intercept_and_allele_frequencies[(i, k_new)] = + self.intercept_and_allele_frequencies[(i, k_old)]; + k_new += 1; + } + // Coverages + new_coverages[(i, j)] = self.coverages[(i, j_)]; + } + } + self.chromosome = new_chromosome; + self.position = new_position; + self.allele = new_allele; + self.intercept_and_allele_frequencies = new_intercept_and_allele_frequencies; + self.coverages = new_coverages; + // println!("self={:?}", self); + // println!("self.chromosome.len()={:?}", self.chromosome.len()); + // println!("self.position.len()={:?}", self.position.len()); + // println!("self.allele.len()={:?}", self.allele.len()); + // println!("self.intercept_and_allele_frequencies.dim()={:?}", self.intercept_and_allele_frequencies.dim()); + // println!("self.coverages.len()={:?}", self.coverages.dim()); + self.check().unwrap(); + Ok(self) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_filtering_missing() { + let file_sync = FileSync { + filename: "./tests/test.sync".to_owned(), + test: "load".to_owned(), + }; + let file_phen = FilePhen { + filename: "./tests/test.csv".to_owned(), + delim: ",".to_owned(), + names_column_id: 0, + sizes_column_id: 1, + trait_values_column_ids: vec![2, 3], + format: "default".to_owned(), + }; + let file_sync_phen = *(file_sync, file_phen).lparse().unwrap(); + //////////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////////// + // NOTE: MAKE SURE THAT FilterStats is set to no filtering except by zero frequency alleles as in below: + // AND THAT WE KEEP ALL NON-ZERO ALLELES ACROSS ALL ENTRIES! + //////////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////////// + let filter_stats = FilterStats { + remove_ns: false, + min_quality: 1.0, + min_coverage: 0, + min_allele_frequency: 0.000001, + max_missingness_rate: 0.0, + pool_sizes: vec![20., 20., 20., 20., 20.], + }; + let n_threads = 2; + let mut frequencies_and_phenotypes = file_sync_phen + .into_genotypes_and_phenotypes(&filter_stats, false, &n_threads) + .unwrap(); + let min_depth_set_to_missing = 5.0; + frequencies_and_phenotypes + .set_missing_by_depth(&min_depth_set_to_missing) + .unwrap(); + // println!("frequencies_and_phenotypes={:?}", frequencies_and_phenotypes); + println!( + "frequencies_and_phenotypes.chromosome[0..5]={:?}", + &frequencies_and_phenotypes.chromosome[0..5] + ); + println!( + "frequencies_and_phenotypes.position[0..5]={:?}", + &frequencies_and_phenotypes.position[0..5] + ); + println!( + "frequencies_and_phenotypes.allele[0..5]={:?}", + &frequencies_and_phenotypes.allele[0..5] + ); + println!( + "frequencies_and_phenotypes.intercept_and_allele_frequencies={:?}", + frequencies_and_phenotypes.intercept_and_allele_frequencies + ); + println!( + "frequencies_and_phenotypes.coverages={:?}", + frequencies_and_phenotypes.coverages + ); + assert_eq!( + frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 1)].is_nan(), + true + ); + assert_eq!( + frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 4)], + 1.0 / 21.0 + ); + assert_eq!( + frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 13)], + 2.0 / 10.0 + ); + assert_eq!( + frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 18)].is_nan(), + true + ); + assert_eq!( + frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 18)].is_nan(), + true + ); + + println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); + println!("Before filtering out pools"); + println!( + "frequencies_and_phenotypes.intercept_and_allele_frequencies:\n{:?}", + frequencies_and_phenotypes.intercept_and_allele_frequencies + ); + println!( + "frequencies_and_phenotypes.coverages:\n{:?}", + frequencies_and_phenotypes.coverages + ); + frequencies_and_phenotypes + .filter_out_top_missing_pools(&0.20) + .unwrap(); + assert_eq!( + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .nrows(), + 4 + ); + println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); + println!("After filtering out pools but before filtering out loci"); + println!( + "frequencies_and_phenotypes.intercept_and_allele_frequencies:\n{:?}", + frequencies_and_phenotypes.intercept_and_allele_frequencies + ); + println!( + "frequencies_and_phenotypes.coverages:\n{:?}", + frequencies_and_phenotypes.coverages + ); + frequencies_and_phenotypes + .filter_out_top_missing_loci(&1.00) + .unwrap(); + assert_eq!( + frequencies_and_phenotypes + .intercept_and_allele_frequencies + .ncols(), + 11000 + ); + println!("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); + println!("After filtering out pools and loci"); + println!( + "frequencies_and_phenotypes.intercept_and_allele_frequencies:\n{:?}", + frequencies_and_phenotypes.intercept_and_allele_frequencies + ); + println!( + "frequencies_and_phenotypes.coverages:\n{:?}", + frequencies_and_phenotypes.coverages + ); + // assert_eq!(0, 1); + } +} diff --git a/src/imputation/mean_imputation.rs b/src/imputation/mean_imputation.rs index 10de0b3..c8d9854 100644 --- a/src/imputation/mean_imputation.rs +++ b/src/imputation/mean_imputation.rs @@ -10,7 +10,7 @@ impl GenotypesAndPhenotypes { let (loci_idx, loci_chr, loci_pos) = self.count_loci().unwrap(); let l_ = loci_idx.len(); assert_eq!(n, n_); - assert_eq!(l, l_-1); // less trailing locus + assert_eq!(l, l_ - 1); // less trailing locus for j in 0..l { // Use the indexes of each locus let idx_ini = loci_idx[j] as usize; @@ -102,7 +102,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![20., 20., 20., 20., 20.], }; let n_threads = 2; diff --git a/src/imputation/mod.rs b/src/imputation/mod.rs index b809b50..b0a613d 100644 --- a/src/imputation/mod.rs +++ b/src/imputation/mod.rs @@ -1,5 +1,5 @@ -pub use self::{adaptive_ld_knn_imputation::*, mean_imputation::*, set_missing_by_depth::*}; +pub use self::{adaptive_ld_knn_imputation::*, filtering_missing::*, mean_imputation::*}; mod adaptive_ld_knn_imputation; +mod filtering_missing; mod mean_imputation; -mod set_missing_by_depth; diff --git a/src/imputation/set_missing_by_depth.rs b/src/imputation/set_missing_by_depth.rs deleted file mode 100644 index f5b95db..0000000 --- a/src/imputation/set_missing_by_depth.rs +++ /dev/null @@ -1,124 +0,0 @@ -use ndarray::prelude::*; -use std::io::{self, Error, ErrorKind}; - -use crate::base::*; - -impl GenotypesAndPhenotypes { - pub fn set_missing_by_depth( - &mut self, - min_depth_set_to_missing: &f64, - ) -> io::Result<&mut Self> { - let (n, p) = self.intercept_and_allele_frequencies.dim(); - let (n_, l) = self.coverages.dim(); - let (loci_idx, loci_chr, loci_pos) = self.count_loci().unwrap(); - let l_ = loci_idx.len(); - // println!("n={}; p={}; n_={}; l={}; l_={}", n, p, n_, l, l_); - assert_eq!(n, n_); - assert_eq!(l, l_-1); // less trailing locus - for i in 0..n { - for j in 0..l { - if self.coverages[(i, j)] < *min_depth_set_to_missing { - // Use the indexes of the locus to set missing values to all alleles in the locus - let idx_ini = loci_idx[j] as usize; - let idx_fin = if j < (l - 1) { - loci_idx[j + 1] as usize - } else { - loci_idx[l - 1] as usize - }; - for k in idx_ini..idx_fin { - self.intercept_and_allele_frequencies[(i, k)] = f64::NAN; - } - } - } - } - Ok(self) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[cfg(test)] -mod tests { - use super::*; - #[test] - fn test_set_missing() { - let file_sync = FileSync { - filename: "./tests/test.sync".to_owned(), - test: "load".to_owned(), - }; - let file_phen = FilePhen { - filename: "./tests/test.csv".to_owned(), - delim: ",".to_owned(), - names_column_id: 0, - sizes_column_id: 1, - trait_values_column_ids: vec![2, 3], - format: "default".to_owned(), - }; - let file_sync_phen = *(file_sync, file_phen).lparse().unwrap(); - //////////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////////// - // NOTE: MAKE SURE THAT FilterStats is set to no filtering except by zero frequency alleles as in below: - // AND THAT WE KEEP ALL NON-ZERO ALLELES ACROSS ALL ENTRIES! - //////////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////////// - let filter_stats = FilterStats { - remove_ns: false, - min_quality: 1.0, - min_coverage: 0, - min_allele_frequency: 0.000001, - min_missingness_rate: 0.0, - pool_sizes: vec![20., 20., 20., 20., 20.], - }; - let n_threads = 2; - let mut frequencies_and_phenotypes = file_sync_phen - .into_genotypes_and_phenotypes(&filter_stats, false, &n_threads) - .unwrap(); - let min_depth_set_to_missing = 5.0; - let _ = frequencies_and_phenotypes - .set_missing_by_depth(&min_depth_set_to_missing) - .unwrap(); - // println!("frequencies_and_phenotypes={:?}", frequencies_and_phenotypes); - println!( - "frequencies_and_phenotypes.chromosome[0..5]={:?}", - &frequencies_and_phenotypes.chromosome[0..5] - ); - println!( - "frequencies_and_phenotypes.position[0..5]={:?}", - &frequencies_and_phenotypes.position[0..5] - ); - println!( - "frequencies_and_phenotypes.allele[0..5]={:?}", - &frequencies_and_phenotypes.allele[0..5] - ); - println!( - "frequencies_and_phenotypes.intercept_and_allele_frequencies={:?}", - frequencies_and_phenotypes.intercept_and_allele_frequencies - ); - println!( - "frequencies_and_phenotypes.coverages={:?}", - frequencies_and_phenotypes.coverages - ); - assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 1)].is_nan(), - true - ); - assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(0, 4)], - 1.0 / 21.0 - ); - assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 13)], - 2.0 / 10.0 - ); - assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 18)].is_nan(), - true - ); - assert_eq!( - frequencies_and_phenotypes.intercept_and_allele_frequencies[(3, 18)].is_nan(), - true - ); - // assert_eq!(0, 1); - } -} diff --git a/src/main.rs b/src/main.rs index 1f072b2..85ec378 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,15 +38,15 @@ struct Args { /// Minimum base quality in terms of base calling error rate, i.e. lower values means higher quality #[clap(long, default_value_t = 0.01)] min_quality: f64, - /// Minimum depth of coverage + /// Minimum depth of coverage (loci with at least one pool below this threshold will be omitted) #[clap(long, default_value_t = 1)] min_coverage: u64, /// Minimum allele frequency for associating the genotypes with the phenotype/s #[clap(long, default_value_t = 0.001)] min_allele_frequency: f64, - /// Minimum missingness rate, i.e. the minimum fraction of pools missing in each locus + /// Maximum missingness rate, i.e. the maximum fraction of pools missing in each locus #[clap(long, default_value_t = 0.0)] - min_missingness_rate: f64, + max_missingness_rate: f64, /// Keep ambiguous reads during SNP filtering, i.e. keep them coded as Ns #[clap(long, action)] keep_ns: bool, @@ -109,6 +109,12 @@ struct Args { /// Imputation parameter, i.e. minimum depth to set to missing data for imputation #[clap(long, default_value_t = 5.00)] min_depth_set_to_missing: f64, + /// Imputation parameter, i.e. fraction of the pools with missing data to be ommited after sorting by rate of missingness + #[clap(long, default_value_t = 0.10)] + frac_top_missing_pools: f64, + /// Imputation parameter, i.e. fraction of the loci with missing data to be ommited after sorting by rate of missingness + #[clap(long, default_value_t = 0.10)] + frac_top_missing_loci: f64, /// Imputation parameter, i.e. imputation method, select from "mean" for simple imputation using mean allele frequencies across non-missing pools, or "aLD-kNNi" for adaptive linkage disequillibrium (estimated using correlations within a window) k-nearest neighbour weighted allele frequencies imputation #[clap(long, default_value = "aLD-kNNi")] imputation_method: String, @@ -181,7 +187,7 @@ fn main() { min_quality: args.min_quality, min_coverage: args.min_coverage, min_allele_frequency: args.min_allele_frequency, - min_missingness_rate: args.min_missingness_rate, + max_missingness_rate: args.max_missingness_rate, pool_sizes: phen.pool_sizes.clone(), }; if args.analysis == String::from("pileup2sync") { @@ -314,6 +320,8 @@ fn main() { &file_sync_phen, &filter_stats, &args.min_depth_set_to_missing, + &args.frac_top_missing_pools, + &args.frac_top_missing_loci, &args.window_size_bp, &args.window_slide_size_bp, &args.min_loci_per_window, diff --git a/src/popgen/gudmc.rs b/src/popgen/gudmc.rs index 17ea767..35a3f21 100644 --- a/src/popgen/gudmc.rs +++ b/src/popgen/gudmc.rs @@ -487,7 +487,7 @@ mod tests { min_quality: 0.01, min_coverage: 1, min_allele_frequency: 0.001, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![42.0, 42.0, 42.0, 42.0, 42.0], }; let genotypes_and_phenotypes = file_sync_phen diff --git a/src/tables/chisq_test.rs b/src/tables/chisq_test.rs index c1307ca..9f68cf0 100644 --- a/src/tables/chisq_test.rs +++ b/src/tables/chisq_test.rs @@ -70,7 +70,7 @@ mod tests { min_quality: 0.01, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![0.2, 0.2, 0.2, 0.2], }; // Outputs diff --git a/src/tables/fisher_exact_test.rs b/src/tables/fisher_exact_test.rs index b4377c9..5f6fc80 100644 --- a/src/tables/fisher_exact_test.rs +++ b/src/tables/fisher_exact_test.rs @@ -151,7 +151,7 @@ mod tests { min_quality: 0.005, min_coverage: 1, min_allele_frequency: 0.005, - min_missingness_rate: 0.0, + max_missingness_rate: 0.0, pool_sizes: vec![0.2, 0.2, 0.2], }; let mut locus_counts = LocusCounts {