Skip to content

Commit

Permalink
rectified imputation and filtering bug where the coverages were not c…
Browse files Browse the repository at this point in the history
…hanged from missing after imputation: solution was to change f64::NAN into f64::INFINITY for imputed loci
  • Loading branch information
jeffersonfparil committed Nov 2, 2023
1 parent 0a49043 commit dd6a337
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 14 deletions.
20 changes: 14 additions & 6 deletions src/imputation/adaptive_ld_knn_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,14 @@ impl GenotypesAndPhenotypes {
}
}
}
// Set missing coverages to infinity to mark imputed data
for i in 0..self.coverages.nrows() {
for j in 0..self.coverages.ncols() {
if self.coverages[(i, j)].is_nan() {
self.coverages[(i, j)] = f64::INFINITY
};
}
}
Ok(self)
}
}
Expand Down Expand Up @@ -342,7 +350,7 @@ pub fn impute_aLDkNN(
"Parsed the sync file into allele frequncies: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -355,7 +363,7 @@ pub fn impute_aLDkNN(
"Set missing loci below the minimum depth: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -368,7 +376,7 @@ pub fn impute_aLDkNN(
"Filtered out sparsest pools: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -381,7 +389,7 @@ pub fn impute_aLDkNN(
"Filtered out sparsest loci: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -400,7 +408,7 @@ pub fn impute_aLDkNN(
"Adaptive LD-kNN imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
// Remove 100% of the loci with missing data
Expand All @@ -414,7 +422,7 @@ pub fn impute_aLDkNN(
"Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
// Output
Expand Down
2 changes: 1 addition & 1 deletion src/imputation/filtering_missing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ impl GenotypesAndPhenotypes {
pub fn missing_rate(&mut self) -> io::Result<f64> {
let (n, l) = self.coverages.dim();
let sum = self.coverages.fold(0,|sum, &x| if x.is_nan(){sum + 1}else{sum});
Ok(sensible_round(sum as f64 / ((n*l) as f64), 2))
Ok(sensible_round(sum as f64 * 100.0 / ((n*l) as f64), 2))
}

pub fn set_missing_by_depth(
Expand Down
49 changes: 42 additions & 7 deletions src/imputation/mean_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ impl GenotypesAndPhenotypes {
} else {
mean_freqs
};
// println!("mean_freqs={:?}", mean_freqs);
for k in 0..freqs.ncols() {
for i in 0..n {
if self.intercept_and_allele_frequencies[(i, idx_ini + k)].is_nan() {
Expand All @@ -40,6 +39,14 @@ impl GenotypesAndPhenotypes {
}
}
}
// Set missing coverages to infinity to mark imputed data
for i in 0..self.coverages.nrows() {
for j in 0..self.coverages.ncols() {
if self.coverages[(i, j)].is_nan() {
self.coverages[(i, j)] = f64::INFINITY
};
}
}
Ok(self)
}
}
Expand All @@ -66,7 +73,7 @@ pub fn impute_mean(
"Parsed the sync file into allele frequncies: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -79,7 +86,7 @@ pub fn impute_mean(
"Set missing loci below the minimum depth: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -92,7 +99,7 @@ pub fn impute_mean(
"Filtered out sparsest pools: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -105,7 +112,7 @@ pub fn impute_mean(
"Filtered out sparsest loci: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
Expand All @@ -116,9 +123,12 @@ pub fn impute_mean(
"Mean value imputation: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);

// println!("genotypes_and_phenotypes={:?}", genotypes_and_phenotypes);

// Remove 100% of the loci with missing data
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
Expand All @@ -130,7 +140,7 @@ pub fn impute_mean(
"Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Missingness: {}% | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
genotypes_and_phenotypes.missing_rate().unwrap() * 100.0,
genotypes_and_phenotypes.missing_rate().unwrap(),
duration.as_secs()
);
// Output
Expand Down Expand Up @@ -229,3 +239,28 @@ mod tests {
// assert_eq!(0, 1);
}
}

// SYNC=/home/jeff/poolgen/tests/test_REMOVE_ME_BEFORE_PUSHING.sync
// PHEN=/home/jeff/poolgen/tests/test_REMOVE_ME_BEFORE_PUSHING.csv
// NCORES=7
// OUT=/home/jeff/poolgen/tests/test-MEAN_IMPUTE-REMOVE_ME_BEFORE_PUSHING.csv
// time cargo run -- impute \
// --imputation-method "mean" \
// -f ${SYNC} \
// -p ${PHEN} \
// --phen-delim , \
// --phen-name-col 0 \
// --phen-pool-size-col 1 \
// --phen-value-col 2 \
// --min-allele-frequency 0.0001 \
// --min-coverage 0 \
// --min-quality 0.01 \
// --max-missingness-rate 0.75 \
// --min-depth-set-to-missing 5 \
// --window-size-bp 1000000 \
// --window-slide-size-bp 1000000 \
// --min-loci-per-window 1 \
// --min-correlation 0.5 \
// --k-neighbours 5 \
// --n-threads ${NCORES} \
// -o ${OUT}

0 comments on commit dd6a337

Please sign in to comment.