Skip to content

Commit

Permalink
Add bed mask invert option
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Aug 1, 2024
1 parent fec44b4 commit a9d1cbb
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 26 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" )
# These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out.
# Thus, do not replace the hashes manually!
SET( CLI11_COMMIT_HASH "5cb3efabce007c3a0230e4cc2e27da491c646b6c" ) #CLI11_COMMIT_HASH#
SET( genesis_COMMIT_HASH "8c419c47a9d6f13ead99df243aa2ba7f2b8683a7" ) #genesis_COMMIT_HASH#
SET( genesis_COMMIT_HASH "5197621925aecee869d2681d64582adea8a0348d" ) #genesis_COMMIT_HASH#

# Call the github download function, which takes four arguments:
# - LIBPATH : Path to the libracy dir where dependencies are stored.
Expand Down
2 changes: 1 addition & 1 deletion libs/genesis
125 changes: 109 additions & 16 deletions src/options/variant_filter_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,36 @@

void VariantFilterMaskOptions::add_mask_filter_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group
) {
add_mask_filter_sample_opts_to_app( sub, group );
add_mask_filter_total_opts_to_app( sub, group );
add_mask_filter_sample_opts_to_app( sub, ref_genome_opts, group );
add_mask_filter_total_opts_to_app( sub, ref_genome_opts, group );
}

// -------------------------------------------------------------------------------------------------
// Sample Masks
// -------------------------------------------------------------------------------------------------

void VariantFilterMaskOptions::add_mask_filter_sample_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group
) {
// Correct setup check.
internal_check(
filter_mask_sample_bed_list_.option == nullptr,
"Cannot use the same VariantFilterMaskOptions object multiple times."
"Cannot use the same VariantFilterMaskOptions object multiple times"
);
internal_check(
!ref_genome_opts_ || ref_genome_opts_ == &ref_genome_opts,
"Cannot use the same VariantFilterMaskOptions with different VariantReferenceGenomeOptions"
);
ref_genome_opts_ = &ref_genome_opts;

// -------------------------------------------
// BED
// -------------------------------------------

// Add option for genomic mask filter by BED file.
filter_mask_sample_bed_list_.option = sub->add_option(
Expand All @@ -84,6 +99,21 @@ void VariantFilterMaskOptions::add_mask_filter_sample_opts_to_app(
filter_mask_sample_bed_list_.option->check( CLI::ExistingFile );
filter_mask_sample_bed_list_.option->group( group );

// Add inversion option for above mask option.
filter_mask_sample_bed_inv_.option = sub->add_flag(
"--filter-mask-samples-bed-invert",
filter_mask_sample_bed_inv_.value,
"When using `--filter-mask-samples-bed-list`, set this flag to invert the specified mask. "
"Needs one of " + ref_genome_opts_->get_reference_option_names() +
" to determine chromosome lengths."
);
filter_mask_sample_bed_inv_.option->group( group );
filter_mask_sample_bed_inv_.option->needs( filter_mask_sample_bed_list_.option );

// -------------------------------------------
// FASTA
// -------------------------------------------

// Add option for genomic mask filter by fasta-style mask file.
filter_mask_sample_fasta_list_.option = sub->add_option(
"--filter-mask-samples-fasta-list",
Expand All @@ -105,18 +135,17 @@ void VariantFilterMaskOptions::add_mask_filter_sample_opts_to_app(
"digits. All positions above that value are masked. The default is 0, meaning that only "
"exactly the positons with value 0 will not be masked."
);
filter_mask_sample_fasta_min_.option->check(CLI::Range(0,9));
filter_mask_sample_fasta_min_.option->check( CLI::Range(0,9 ));
filter_mask_sample_fasta_min_.option->group( group );
filter_mask_sample_fasta_min_.option->needs( filter_mask_sample_fasta_list_.option );

// Add inversion option for above mask option.
filter_mask_sample_fasta_inv_.option = sub->add_flag(
"--filter-mask-samples-fasta-invert",
filter_mask_sample_fasta_inv_.value,
"When using `--filter-mask-samples-fasta-list`, invert the mask."
"When using `--filter-mask-samples-fasta-list`, invert the mask. "
"When this flag is set, the mask specified above is inverted."
);
filter_mask_sample_fasta_inv_.option->check(CLI::Range(0,9));
filter_mask_sample_fasta_inv_.option->group( group );
filter_mask_sample_fasta_inv_.option->needs( filter_mask_sample_fasta_list_.option );

Expand All @@ -125,15 +154,29 @@ void VariantFilterMaskOptions::add_mask_filter_sample_opts_to_app(
filter_mask_sample_fasta_list_.option->excludes( filter_mask_sample_bed_list_.option );
}

// -------------------------------------------------------------------------------------------------
// Total Masks
// -------------------------------------------------------------------------------------------------

void VariantFilterMaskOptions::add_mask_filter_total_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group
) {
// Correct setup check.
internal_check(
filter_mask_total_bed_.option == nullptr,
"Cannot use the same VariantFilterMaskOptions object multiple times."
"Cannot use the same VariantFilterMaskOptions object multiple times"
);
internal_check(
!ref_genome_opts_ || ref_genome_opts_ == &ref_genome_opts,
"Cannot use the same VariantFilterMaskOptions with different VariantReferenceGenomeOptions"
);
ref_genome_opts_ = &ref_genome_opts;

// -------------------------------------------
// BED
// -------------------------------------------

// Add option for genomic mask filter by BED file.
filter_mask_total_bed_.option = sub->add_option(
Expand All @@ -155,6 +198,21 @@ void VariantFilterMaskOptions::add_mask_filter_total_opts_to_app(
filter_mask_total_bed_.option->check( CLI::ExistingFile );
filter_mask_total_bed_.option->group( group );

// Add inversion option for above mask option.
filter_mask_total_bed_inv_.option = sub->add_flag(
"--filter-mask-total-bed-invert",
filter_mask_total_bed_inv_.value,
"When using `--filter-mask-total-bed`, set this flag to invert the specified mask. "
"Needs one of " + ref_genome_opts_->get_reference_option_names() +
" to determine chromosome lengths."
);
filter_mask_total_bed_inv_.option->group( group );
filter_mask_total_bed_inv_.option->needs( filter_mask_total_bed_.option );

// -------------------------------------------
// FASTA
// -------------------------------------------

// Add option for genomic mask filter by fasta-style mask file.
filter_mask_total_fasta_.option = sub->add_option(
"--filter-mask-total-fasta",
Expand All @@ -178,7 +236,7 @@ void VariantFilterMaskOptions::add_mask_filter_total_opts_to_app(
"All positions above that value are masked. The default is 0, meaning that only exactly "
"the positons with value 0 will not be masked."
);
filter_mask_total_fasta_min_.option->check(CLI::Range(0,9));
filter_mask_total_fasta_min_.option->check( CLI::Range(0,9) );
filter_mask_total_fasta_min_.option->group( group );
filter_mask_total_fasta_min_.option->needs( filter_mask_total_fasta_.option );

Expand All @@ -190,7 +248,6 @@ void VariantFilterMaskOptions::add_mask_filter_total_opts_to_app(
"as the equivalent in vcftools, but instead of specifying the file, this here is a flag. "
"When it is set, the mask specified above is inverted."
);
filter_mask_total_fasta_inv_.option->check(CLI::Range(0,9));
filter_mask_total_fasta_inv_.option->group( group );
filter_mask_total_fasta_inv_.option->needs( filter_mask_total_fasta_.option );

Expand All @@ -217,10 +274,9 @@ void VariantFilterMaskOptions::prepare_masks() const
// check_masks_against_reference
// -------------------------------------------------------------------------

void VariantFilterMaskOptions::check_masks_against_reference(
std::shared_ptr<genesis::sequence::SequenceDict> ref_dict
) const {
check_reference_and_masks_compatibility_( ref_dict );
void VariantFilterMaskOptions::check_masks_against_reference() const
{
check_reference_and_masks_compatibility_();
check_inter_masks_compatibility_();
}

Expand Down Expand Up @@ -303,7 +359,11 @@ void VariantFilterMaskOptions::prepare_sample_masks_() const
{
using namespace genesis;
using namespace genesis::population;
using namespace genesis::sequence;
using namespace genesis::utils;
internal_check(
ref_genome_opts_, "VariantFilterMaskOptions needs VariantReferenceGenomeOptions"
);

// Not running again if we already have set up a filter (i.e., if the shared pointer has data).
if( ! sample_masks_.empty() ) {
Expand Down Expand Up @@ -356,6 +416,18 @@ void VariantFilterMaskOptions::prepare_sample_masks_() const

// Add the masks from bed files.
if( *filter_mask_sample_bed_list_.option ) {
// We need a dict to invert
auto ref_dict = ref_genome_opts_->get_reference_dict();
if( filter_mask_sample_bed_inv_.value && !ref_dict ) {
throw CLI::ValidationError(
filter_mask_sample_bed_inv_.option->get_name(),
"Cannot invert BED mask without one of " +
ref_genome_opts_->get_reference_option_names() +
" being provided to determine the chromosome lengths"
);
}

// Read the files
LOG_MSG2 << "Reading sample masks BED files " << filter_mask_sample_bed_list_.value;
auto const sample_to_file = get_list_file_pairs_( filter_mask_sample_bed_list_ );
for( auto const& sample_file_pair : sample_to_file ) {
Expand All @@ -364,6 +436,9 @@ void VariantFilterMaskOptions::prepare_sample_masks_() const
sample_masks_[ sample_file_pair.first ] = std::make_shared<GenomeLocusSet>(
BedReader().read_as_genome_locus_set( from_file( sample_file_pair.second ))
);
if( filter_mask_sample_bed_inv_.value ) {
sample_masks_[ sample_file_pair.first ]->invert( *ref_dict );
}
}
}

Expand Down Expand Up @@ -393,7 +468,11 @@ void VariantFilterMaskOptions::prepare_total_mask_() const
{
using namespace genesis;
using namespace genesis::population;
using namespace genesis::sequence;
using namespace genesis::utils;
internal_check(
ref_genome_opts_, "VariantFilterMaskOptions needs VariantReferenceGenomeOptions"
);

// Not running again if we already have set up a filter (i.e., if the shared pointer has data).
if( total_mask_ ) {
Expand All @@ -406,6 +485,18 @@ void VariantFilterMaskOptions::prepare_total_mask_() const
total_mask_ = std::make_shared<GenomeLocusSet>(
BedReader().read_as_genome_locus_set( from_file( filter_mask_total_bed_.value ))
);
if( filter_mask_total_bed_inv_.value ) {
auto ref_dict = ref_genome_opts_->get_reference_dict();
if( !ref_dict ) {
throw CLI::ValidationError(
filter_mask_total_bed_inv_.option->get_name(),
"Cannot invert BED mask without one of " +
ref_genome_opts_->get_reference_option_names() +
" being provided to determine the chromosome lengths"
);
}
total_mask_->invert( *ref_dict );
}
}

// Add the mask from a fasta file.
Expand Down Expand Up @@ -483,12 +574,14 @@ void VariantFilterMaskOptions::check_sample_masks_name_list_(
// check_reference_and_masks_compatibility_
// -------------------------------------------------------------------------

void VariantFilterMaskOptions::check_reference_and_masks_compatibility_(
std::shared_ptr<genesis::sequence::SequenceDict> ref_dict
) const {
void VariantFilterMaskOptions::check_reference_and_masks_compatibility_() const {
using namespace genesis::sequence;

// See if there is a ref dict at all to compare to.
internal_check(
ref_genome_opts_, "VariantFilterMaskOptions needs VariantReferenceGenomeOptions"
);
auto ref_dict = ref_genome_opts_->get_reference_dict();
if( !ref_dict ) {
return;
}
Expand Down
17 changes: 11 additions & 6 deletions src/options/variant_filter_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include "CLI/CLI.hpp"

#include "options/variant_reference_genome.hpp"
#include "tools/cli_option.hpp"

#include "genesis/population/stream/variant_input_stream.hpp"
Expand Down Expand Up @@ -78,16 +79,19 @@ class VariantFilterMaskOptions

void add_mask_filter_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group = "Masking Filters"
);

void add_mask_filter_sample_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group = "Masking Filters"
);

void add_mask_filter_total_opts_to_app(
CLI::App* sub,
VariantReferenceGenomeOptions const& ref_genome_opts,
std::string const& group = "Masking Filters"
);

Expand Down Expand Up @@ -122,9 +126,7 @@ class VariantFilterMaskOptions
/**
* @brief Check that the mask fits with a given reference genome, if given.
*/
void check_masks_against_reference(
std::shared_ptr<genesis::sequence::SequenceDict> ref_dict
) const;
void check_masks_against_reference() const;

/**
* @brief Check that the provided per-sample masks and the input sample names
Expand Down Expand Up @@ -160,9 +162,7 @@ class VariantFilterMaskOptions
void check_sample_masks_name_list_(
std::vector<std::string> const& sample_names
) const;
void check_reference_and_masks_compatibility_(
std::shared_ptr<genesis::sequence::SequenceDict> ref_dict
) const;
void check_reference_and_masks_compatibility_() const;
void check_inter_masks_compatibility_() const;

// -------------------------------------------------------------------------
Expand All @@ -173,16 +173,21 @@ class VariantFilterMaskOptions

// Filters for each sample individually
CliOption<std::string> filter_mask_sample_bed_list_;
CliOption<bool> filter_mask_sample_bed_inv_ = false;
CliOption<std::string> filter_mask_sample_fasta_list_;
CliOption<size_t> filter_mask_sample_fasta_min_ = 0;
CliOption<bool> filter_mask_sample_fasta_inv_ = false;

// Filters for the whole variant
CliOption<std::string> filter_mask_total_bed_;
CliOption<bool> filter_mask_total_bed_inv_ = false;
CliOption<std::string> filter_mask_total_fasta_;
CliOption<size_t> filter_mask_total_fasta_min_ = 0;
CliOption<bool> filter_mask_total_fasta_inv_ = false;

// We need a reference dict, for verification, and for inverting bed files
VariantReferenceGenomeOptions const* ref_genome_opts_ = nullptr;

// We keep the masks here. Bits that are set here are masked!
mutable std::unordered_map<std::string, std::shared_ptr<GenomeLocusSet>> sample_masks_;
mutable std::shared_ptr<GenomeLocusSet> total_mask_;
Expand Down
4 changes: 2 additions & 2 deletions src/options/variant_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ void VariantInputOptions::add_variant_input_opts_to_app(
region_filter_options_.add_region_filter_opts_to_app( sub );
}
if( suboptions.with_mask_filter_opts ) {
mask_filter_options_.add_mask_filter_opts_to_app( sub );
mask_filter_options_.add_mask_filter_opts_to_app( sub, reference_genome_options_ );
}
}

Expand Down Expand Up @@ -300,7 +300,7 @@ void VariantInputOptions::prepare_inputs_() const
// as that would only make sense for the fasta mask type region filter, which is kind of an
// outlyer anyway.
// We do test the reference genome and mask files here though, if both are given.
mask_filter_options_.check_masks_against_reference( get_reference_dict() );
mask_filter_options_.check_masks_against_reference();
}

// -------------------------------------------------------------------------
Expand Down
14 changes: 14 additions & 0 deletions src/options/variant_reference_genome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ void VariantReferenceGenomeOptions::add_reference_genome_opts_to_app(
reference_genome_fai_file_.option->excludes( reference_genome_dict_file_.option );
}

std::string VariantReferenceGenomeOptions::get_reference_option_names() const
{
if( !reference_genome_fasta_file_.option ) {
throw std::domain_error(
"Call to VariantReferenceGenomeOptions::get_reference_option_names() before being set up"
);
}

auto const n0 = reference_genome_fasta_file_.option->get_name();
auto const n1 = reference_genome_dict_file_.option->get_name();
auto const n2 = reference_genome_fai_file_.option->get_name();
return "`" + n0 + "`, `" + n1 + "`, `" + n2 + "`";
}

// =================================================================================================
// Run Functions
// =================================================================================================
Expand Down
5 changes: 5 additions & 0 deletions src/options/variant_reference_genome.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ class VariantReferenceGenomeOptions
std::string const& group = "Input Settings"
);

/**
* @brief Helper function to get a string of the option names.
*/
std::string get_reference_option_names() const;

// -------------------------------------------------------------------------
// Run Functions
// -------------------------------------------------------------------------
Expand Down

0 comments on commit a9d1cbb

Please sign in to comment.