Skip to content

Commit

Permalink
Namespace refactoring:
Browse files Browse the repository at this point in the history
- proba_matrix moved to rappas::
- input/output classes and functions moved to rappas::io
  • Loading branch information
Nikolai Romashchenko committed May 13, 2019
1 parent d195431 commit 3085f73
Show file tree
Hide file tree
Showing 15 changed files with 375 additions and 265 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ endif()

# define a hash map
if(NOT HASH_MAP)
set(HASH_MAP "USE_TSL_ROBIN_MAP")
set(HASH_MAP "USE_SKA_BYTELL_HASH_MAP")
endif()

add_subdirectory(core)
Expand Down
3 changes: 2 additions & 1 deletion build/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
# WARNING: some dependencies are not listed here. See the top-level CMake file
# for more details
find_package(Boost REQUIRED COMPONENTS program_options filesystem iostreams)
find_package(Threads REQUIRED)
find_package(OpenMP REQUIRED)

######################################################################################################
# Application target and properties
Expand Down Expand Up @@ -40,6 +40,7 @@ target_link_libraries(build_n
Boost::program_options
Boost::filesystem
Boost::iostreams
OpenMP::OpenMP_CXX
strasser::csv_parser
rappas::core_n
rappas::utils
Expand Down
174 changes: 120 additions & 54 deletions build/src/build/db_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,45 @@
using std::string;
using std::cout, std::endl;
using std::to_string;
using core::phylo_kmer, core::phylo_kmer_db, core::phylo_tree;

class db_builder
namespace rappas
{
friend core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file,
const std::string& tree_file, const std::string& extended_mapping_file,
const std::string& artree_mapping_file, size_t kmer_size);
public:
db_builder(const std::string& working_directory, const std::string& ar_probabilities_file,
const std::string& tree_file, const std::string& extended_mapping_file,
const std::string& artree_mapping_file, size_t kmer_size);

void run();

private:
size_t explore_kmers(const core::phylo_tree& tree, const proba_matrix& probas);
size_t explore_branch(const branch_entry& probas, core::phylo_kmer::branch_type common_branch_label);

std::string _working_directory;
std::string _ar_probabilities_file;
std::string _tree_file;
std::string _extended_mapping_file;
std::string _artree_mapping_file;

size_t _kmer_size;
core::phylo_kmer_db _phylo_kmer_db;
extended_mapping _extended_mapping;
artree_label_mapping _artree_mapping;
};
/// Builds a phylo-kmer database from the input files whose paths are given to the constructor.
/// Usage as seen in build(): construct, call run(), then take _phylo_kmer_db.
class db_builder
{
/// build() is a friend so that it can move the private _phylo_kmer_db out of the builder.
friend phylo_kmer_db build(const string& working_directory, const string& ar_probabilities_file,
const string& tree_file, const string& extended_mapping_file,
const string& artree_mapping_file, size_t kmer_size);
public:
/// Per-branch storage: k-mer key -> score.
using branch_hash_map = core::hash_map<phylo_kmer::key_type, phylo_kmer::score_type>;

/// Stores the paths and the k-mer size; the database is created with the same k-mer size.
db_builder(const string& working_directory, const string& ar_probabilities_file,
const string& tree_file, const string& extended_mapping_file,
const string& artree_mapping_file, size_t kmer_size);

/// Loads both mappings, the tree and the probability matrix, then runs explore_kmers.
void run();

private:
/// Explores the ghost nodes of the tree and fills _phylo_kmer_db; returns the number of k-mers visited.
size_t explore_kmers(const phylo_tree& tree, const proba_matrix& probas);
/// Explores one node's probabilities; returns the node's hash map and the number of k-mers visited.
std::pair<branch_hash_map, size_t> explore_branch(const node_entry& probas);

/// Input locations, as passed to the constructor.
string _working_directory;
string _ar_probabilities_file;
string _tree_file;
string _extended_mapping_file;
string _artree_mapping_file;

size_t _kmer_size;
/// The resulting database; moved out by build() after run().
phylo_kmer_db _phylo_kmer_db;
/// Temporary per-branch maps filled in explore_kmers and emptied once merged into the database.
std::vector<branch_hash_map> _branch_maps;

/// Looked up by ghost node label in explore_kmers to get the node id stored in the database.
extended_mapping _extended_mapping;
/// Looked up by ghost node label in explore_kmers to get the label used to search the probability matrix.
artree_label_mapping _artree_mapping;
};

}

using namespace rappas;

db_builder::db_builder(const string& working_directory, const string& ar_probabilities_file, const string& tree_file,
const string& extended_mapping_file, const string& artree_mapping_file, size_t kmer_size)
Expand All @@ -51,60 +62,112 @@ db_builder::db_builder(const string& working_directory, const string& ar_probabi
, _phylo_kmer_db{ kmer_size }
{}

size_t db_builder::explore_branch(const branch_entry& probas, core::phylo_kmer::branch_type original_id)
/// Stores `score` under `key` in a per-branch hash map, unless the map already holds
/// an equal or higher score for that key. Used to process branches in parallel.
void put(db_builder::branch_hash_map& map, phylo_kmer::key_type key, phylo_kmer::score_type score)
{
    const auto existing = map.find(key);
    const bool is_new_key = (existing == map.end());

    /// Write only when the key is absent or the stored score is strictly lower
    if (is_new_key || existing->second < score)
    {
        map[key] = score;
    }
}

/// Explores all k-mers of one node's probability submatrix.
///
/// Every k-mer of every window is put into a hash map local to this call (keeping the
/// best score per key, see put()), so the function does not write to any shared state.
///
/// @param probas  probabilities of the node to explore
/// @return        the filled hash map and the number of k-mers visited (duplicates included)
std::pair<db_builder::branch_hash_map, size_t> db_builder::explore_branch(const node_entry& probas)
{
    branch_hash_map hash_map;
    size_t count = 0;
    for (auto window = probas.begin(_kmer_size); window != probas.end(); ++window)
    {
        for (const auto& kmer : *window)
        {
            put(hash_map, kmer.key, kmer.score);
            ++count;
        }
    }
    return { std::move(hash_map), count };
}

bool is_fake(const core::phylo_node& node)
/// Tells whether a node is a ghost node, i.e. its label ends with "_X0" or "_X1".
bool is_ghost(const core::phylo_node& node)
{
    const string& node_label = node.get_label();
    if (boost::ends_with(node_label, "_X0"))
    {
        return true;
    }
    return boost::ends_with(node_label, "_X1");
}

/// Collects the labels of all ghost nodes of the tree, in the tree's iteration order.
std::vector<std::string> get_ghost_ids(const core::phylo_tree& tree)
{
    std::vector<std::string> ghost_labels;

    for (const auto& node : tree)
    {
        if (!is_ghost(node))
        {
            continue;
        }
        ghost_labels.push_back(node.get_label());
    }
    return ghost_labels;
}


/// Explores the k-mers of every ghost node of the tree and stores them in the database.
///
/// Ghost nodes (see is_ghost) are processed in parallel with OpenMP. Each ghost node gets
/// its own hash map in _branch_maps, filled by explore_branch. The maps are then merged
/// serially into _phylo_kmer_db under the node id taken from _extended_mapping.
///
/// @param tree    tree whose nodes are scanned for ghost nodes
/// @param probas  probability matrix, searched by the label taken from _artree_mapping
/// @return        total number of k-mers visited (sum of the counts returned by explore_branch)
size_t db_builder::explore_kmers(const core::phylo_tree& tree, const proba_matrix& probas)
{
    size_t count = 0;

    /// Filter ghost nodes
    const auto ghost_node_ids = get_ghost_ids(tree);
    std::vector<phylo_kmer::branch_type> original_node_ids(ghost_node_ids.size());

    /// Process branches in parallel. Results of the branch-and-bound algorithm are stored
    /// in a hash map for every branch separately. There is one map per ghost node, so that
    /// _branch_maps and original_node_ids always share the same index space.
    _branch_maps.resize(ghost_node_ids.size());

    /// `count` is accumulated through a reduction: a plain `+=` on a shared variable
    /// from several threads would be a data race.
    /// NOTE(review): operator[] on _extended_mapping/_artree_mapping is called concurrently;
    /// this is only safe if it never inserts or otherwise mutates the mapping — confirm.
    #pragma omp parallel for schedule(auto) reduction(+: count)
    for (size_t i = 0; i < ghost_node_ids.size(); ++i)
    {
        const auto& branch_node_label = ghost_node_ids[i];
        original_node_ids[i] = _extended_mapping[branch_node_label];

        /// Get submatrix of probabilities for a current branch node (if presented in proba matrix)
        const auto phyml_node_label = _artree_mapping[branch_node_label];
        if (const auto& it = probas.find(phyml_node_label); it != probas.end())
        {
            size_t branch_count;
            std::tie(_branch_maps[i], branch_count) = explore_branch(it->second);
            count += branch_count;
        }
    }

    /// Merge hash maps in a final data structure
    for (size_t i = 0; i < _branch_maps.size(); ++i)
    {
        auto& map = _branch_maps[i];
        for (const auto& [key, score] : map)
        {
            _phylo_kmer_db.put(key, original_node_ids[i], score);
        }
        /// Replace a map with an empty one to free memory
        map = {};
    }

    return count;
}

void db_builder::run()
{
_extended_mapping = load_extended_mapping(_extended_mapping_file);
_artree_mapping = load_artree_mapping(_artree_mapping_file);
_extended_mapping = rappas::io::load_extended_mapping(_extended_mapping_file);
_artree_mapping = rappas::io::load_artree_mapping(_artree_mapping_file);

const auto tree = core::load_newick(_tree_file);
const auto probas = load_phyml_probas(_ar_probabilities_file);
const auto tree = rappas::io::load_newick(_tree_file);
const auto proba_matrix = rappas::io::load_phyml_probas(_ar_probabilities_file);

/// Run the branch and bound algorithm
std::cout << "Building database..." << std::endl;
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
const auto tuples_count = explore_kmers(tree, probas);
const auto tuples_count = explore_kmers(tree, proba_matrix);
auto end = std::chrono::steady_clock::now();

size_t total_entries = 0;
Expand All @@ -118,12 +181,15 @@ void db_builder::run()
<< "\n\n" << std::flush;
}

core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file,
const std::string& tree_file, const std::string& extended_mapping_file,
const std::string& artree_mapping_file, size_t kmer_size)
namespace rappas
{
db_builder builder(working_directory, ar_probabilities_file, tree_file,
extended_mapping_file, artree_mapping_file, kmer_size);
builder.run();
return std::move(builder._phylo_kmer_db);
}
/// Creates a db_builder from the given paths and k-mer size, runs it,
/// and returns the database it has built.
phylo_kmer_db build(const std::string& working_directory, const std::string& ar_probabilities_file,
                    const std::string& tree_file, const std::string& extended_mapping_file,
                    const std::string& artree_mapping_file, size_t kmer_size)
{
    db_builder database_builder{ working_directory, ar_probabilities_file, tree_file,
                                 extended_mapping_file, artree_mapping_file, kmer_size };
    database_builder.run();

    /// The builder is destroyed on return, so the database is moved out rather than copied
    auto& database = database_builder._phylo_kmer_db;
    return std::move(database);
}
}
9 changes: 6 additions & 3 deletions build/src/build/db_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ namespace core
class phylo_kmer_db;
}

core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file,
const std::string& tree_file, const std::string& extended_mapping_file,
const std::string& artree_mapping_file, size_t kmer_size);
namespace rappas
{
/// Builds a phylo-kmer database and returns it.
/// @param working_directory      working directory, stored by the builder
/// @param ar_probabilities_file  path of the file the probability matrix is loaded from
/// @param tree_file              path of the newick tree file
/// @param extended_mapping_file  path of the extended mapping file
/// @param artree_mapping_file    path of the artree label mapping file
/// @param kmer_size              k-mer size of the resulting database
core::phylo_kmer_db build(const std::string& working_directory, const std::string& ar_probabilities_file,
const std::string& tree_file, const std::string& extended_mapping_file,
const std::string& artree_mapping_file, size_t kmer_size);
}

#endif
9 changes: 5 additions & 4 deletions build/src/main.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include <iostream>
#include <boost/filesystem.hpp>
#include <chrono>
#include <core/phylo_kmer_db.h>
#include <core/serialization.h>
#include <chrono>
#include "cli/command_line.h"
#include "cli/exceptions.h"
#include "return.h"
Expand All @@ -29,11 +29,12 @@ return_code run(const cli::cli_parameters& parameters)
}
case cli::build:
{
const auto db = build_database(parameters.working_directory, parameters.ar_probabilities_file,
parameters.tree_file, parameters.extended_mapping_file, parameters.artree_mapping_file,
parameters.kmer_size);
const auto db = rappas::build(parameters.working_directory, parameters.ar_probabilities_file,
parameters.tree_file, parameters.extended_mapping_file,
parameters.artree_mapping_file, parameters.kmer_size);

const auto db_filename = fs::path(parameters.working_directory) / "DB.union";

std::cout << "Saving database to: " << db_filename.string() << "..." << std::endl;
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
core::save(db, db_filename.string());
Expand Down
22 changes: 12 additions & 10 deletions build/src/pp_matrix/branch_entry.cpp
Original file line number Diff line number Diff line change
@@ -1,43 +1,45 @@
#include "branch_entry.h"

branch_entry::branch_entry(branch_id _id, vector_type&& rows)
using namespace rappas;

/// Creates an entry with the given label, taking ownership of the given rows.
node_entry::node_entry(branch_type _id, vector_type&& rows)
    : _branch_label{ _id },
      _rows{ std::move(rows) }
{
}

branch_entry::const_iterator branch_entry::begin(uint32_t kmer_size) const
/// Returns an iterator over this entry for the given k-mer size.
/// The iterator is built from { this, 0, kmer_size }
/// (presumably entry, start position, k-mer size — TODO confirm against the view type).
node_entry::const_iterator node_entry::begin(uint32_t kmer_size) const
{
return { { this, 0, kmer_size } };
}

branch_entry::const_iterator branch_entry::end() const
/// Returns the past-the-end iterator of this entry, built from { this, 0, 0 }.
/// NOTE(review): the zero k-mer size presumably marks the end sentinel — confirm against the iterator's comparison.
node_entry::const_iterator node_entry::end() const
{
return { { this, 0, 0 } };
}

void branch_entry::push_back(row&& r)
/// Appends a row to the end of the entry.
/// The row is received by rvalue reference, so it is moved into the storage:
/// a named rvalue reference is an lvalue, and without std::move it would be copied.
void node_entry::push_back(row_type&& row)
{
    _rows.push_back(std::move(row));
}

size_t branch_entry::get_alignment_size() const
/// Returns the alignment size, which is the number of rows stored in the entry.
size_t node_entry::get_alignment_size() const
{
    const size_t row_count = _rows.size();
    return row_count;
}

branch_id branch_entry::get_branch_label() const
/// Returns the label this entry was created with.
branch_type node_entry::get_label() const
{
    return this->_branch_label;
}

const proba_pair& branch_entry::at(size_t position, size_t variant) const
/// Returns the element stored at the given position (row) and variant (index within the row).
/// No bounds checking is performed.
const proba_pair& node_entry::at(size_t position, size_t variant) const
{
    const auto& row = _rows[position];
    return row[variant];
}

bool operator==(const branch_entry& lhs, const branch_entry& rhs)
/// Two entries are considered equal when their labels are equal; rows are not compared.
bool operator==(const node_entry& lhs, const node_entry& rhs)
{
    return lhs.get_label() == rhs.get_label();
}

view_iterator::view_iterator(branch_entry_view view) noexcept
Expand Down
Loading

0 comments on commit 3085f73

Please sign in to comment.