Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: refactor calculate_gather_stats to disallow repeated downsampling #3352

Merged
merged 29 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d9eeafd
implement downsample_max_hash in terms of downsample_scaled
ctb Oct 5, 2024
4ab87e4
adjust references & use clone too much
ctb Oct 6, 2024
ffa9683
switch to using scaled; avoid unnecessary clones?
ctb Oct 6, 2024
5f70fcc
update to fix? scaled=0 situation
ctb Oct 11, 2024
17f56a6
error on upsampling
ctb Oct 11, 2024
b60fbb0
Merge branch 'latest' of github.com:sourmash-bio/sourmash into refact…
ctb Oct 11, 2024
ffd8ffe
define new error
ctb Oct 11, 2024
b67fc04
bump sourmash core to 0.16.0 b/c of API change
ctb Oct 11, 2024
9b9fc5a
update include/sourmash.h
ctb Oct 11, 2024
a0b93eb
add some downsampling-focused tests
ctb Oct 11, 2024
79e7b7d
rename error type; test specific error type
ctb Oct 11, 2024
b118779
cargo fmt
ctb Oct 11, 2024
464ec60
fix by doing too much downsampling
ctb Oct 11, 2024
641d9fc
upd
ctb Oct 11, 2024
a9b91c9
upd
ctb Oct 11, 2024
79afb85
simplify
ctb Oct 11, 2024
ddcb049
upd again
ctb Oct 11, 2024
6eb86a3
simplify
ctb Oct 12, 2024
b47a6c4
comment
ctb Oct 12, 2024
62f03eb
add KmerMinHashBTree test
ctb Oct 12, 2024
ff4eeda
feat: Implement TryInto to convert Signature and SigStore into KmerMi…
luizirber Oct 13, 2024
e4e5555
update include/sourmash.h
ctb Oct 13, 2024
ceaea39
fix two rs errors
ctb Oct 13, 2024
0eeca48
refactor calculate_gather_stats to take ds query & return isect
ctb Oct 14, 2024
cd0a209
fix format
ctb Oct 14, 2024
2da0084
fix name of query to remaining_query
ctb Oct 14, 2024
405c518
Merge branch 'latest' of github.com:sourmash-bio/sourmash into gather…
ctb Oct 14, 2024
03102a5
Merge branch 'latest' of github.com:sourmash-bio/sourmash into gather…
ctb Oct 15, 2024
72d2044
fix merge mistakes
ctb Oct 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
use log::trace;
use serde::{Deserialize, Serialize};
use stats::{median, stddev};
use std::cmp::max;
use typed_builder::TypedBuilder;

use crate::ani_utils::{ani_ci_from_containment, ani_from_containment};
Expand All @@ -28,6 +27,7 @@
use crate::signature::SigsTrait;
use crate::sketch::minhash::KmerMinHash;
use crate::storage::SigStore;
use crate::Error::CannotUpsampleScaled;
use crate::Result;

#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)]
Expand Down Expand Up @@ -209,7 +209,7 @@
#[allow(clippy::too_many_arguments)]
pub fn calculate_gather_stats(
orig_query: &KmerMinHash,
query: KmerMinHash,
remaining_query: KmerMinHash,
match_sig: SigStore,
match_size: usize,
gather_result_rank: usize,
Expand All @@ -218,29 +218,31 @@
calc_abund_stats: bool,
calc_ani_ci: bool,
confidence: Option<f64>,
) -> Result<GatherResult> {
) -> Result<(GatherResult, (Vec<u64>, u64))> {
// get match_mh
let match_mh = match_sig.minhash().expect("cannot retrieve sketch");

let max_scaled = max(match_mh.scaled(), query.scaled());
let query = query
.downsample_scaled(max_scaled)
.expect("cannot downsample query");
// it's ok to downsample match, but query is often big and repeated,
// so we do not allow downsampling of query in this function.
if match_mh.scaled() > remaining_query.scaled() {
return Err(CannotUpsampleScaled);

Check warning on line 228 in src/core/src/index/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/index/mod.rs#L228

Added line #L228 was not covered by tests
}

let match_mh = match_mh
.clone()
.downsample_scaled(max_scaled)
.downsample_scaled(remaining_query.scaled())
.expect("cannot downsample match");

// calculate intersection
let isect = match_mh
.intersection(&query)
.intersection(&remaining_query)
.expect("could not do intersection");
let isect_size = isect.0.len();
trace!("isect_size: {}", isect_size);
trace!("query.size: {}", query.size());
trace!("query.size: {}", remaining_query.size());

//bp remaining in subtracted query
let remaining_bp = (query.size() - isect_size) * query.scaled() as usize;
let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
Expand Down Expand Up @@ -300,7 +302,7 @@
// If abundance, calculate abund-related metrics (vs current query)
if calc_abund_stats {
// take abunds from subtracted query
let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&query) {
let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&remaining_query) {
Ok((abunds, unique_weighted_found)) => (abunds, unique_weighted_found),
Err(e) => {
return Err(e);
Expand Down Expand Up @@ -347,7 +349,7 @@
.sum_weighted_found(sum_total_weighted_found)
.total_weighted_hashes(total_weighted_hashes)
.build();
Ok(result)
Ok((result, isect))
}

#[cfg(test)]
Expand Down Expand Up @@ -403,7 +405,7 @@
let gather_result_rank = 0;
let calc_abund_stats = true;
let calc_ani_ci = false;
let result = calculate_gather_stats(
let (result, _isect) = calculate_gather_stats(
&orig_query,
query,
match_sig.into(),
Expand All @@ -416,6 +418,7 @@
None,
)
.unwrap();

// first, print all results
assert_eq!(result.filename(), "match-filename");
assert_eq!(result.name(), "match-name");
Expand Down
33 changes: 19 additions & 14 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::cmp::max;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
Expand Down Expand Up @@ -393,30 +394,28 @@ impl RevIndexOps for RevIndex {
}

let match_sig = self.collection.sig_for_dataset(dataset_id)?;

// get downsampled minhashes for comparison.
let match_mh = match_sig.minhash().unwrap().clone();
let scaled = query.scaled();

// make downsampled minhashes
let max_scaled = max(match_mh.scaled(), query.scaled());

let match_mh = match_mh
.downsample_scaled(scaled)
.downsample_scaled(max_scaled)
.expect("cannot downsample match");

// repeatedly downsample query, then extract to KmerMinHash
// => calculate_gather_stats
query = query
.downsample_scaled(max_scaled)
.expect("cannot downsample query");
let query_mh = KmerMinHash::from(query.clone());

// just calculate essentials here
let gather_result_rank = matches.len();

let query_mh = KmerMinHash::from(query.clone());

// grab the specific intersection:
let isect = match_mh
.intersection(&query_mh)
.expect("failed to intersect");
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// Calculate stats
let gather_result = calculate_gather_stats(
let (gather_result, isect) = calculate_gather_stats(
&orig_query,
query_mh,
match_sig,
Expand All @@ -429,6 +428,12 @@ impl RevIndexOps for RevIndex {
ani_confidence_interval_fraction,
)
.expect("could not calculate gather stats");

// use intersection from calc_gather_stats to make a KmerMinHash.
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// keep track of the sum weighted found
sum_weighted_found = gather_result.sum_weighted_found();
matches.push(gather_result);
Expand Down
Loading