diff --git a/CMakeLists.txt b/CMakeLists.txt index 5029a6d..8d19e79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ endif() # define a hash map if(NOT HASH_MAP) - set(HASH_MAP "USE_TSL_ROBIN_MAP") + set(HASH_MAP "USE_SKA_BYTELL_HASH_MAP") endif() add_subdirectory(core) diff --git a/build/CMakeLists.txt b/build/CMakeLists.txt index c9ee811..ea7a123 100644 --- a/build/CMakeLists.txt +++ b/build/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.10 FATAL_ERROR) # WARNING: some dependencies are not listed here. See the top-level CMake file # for more details find_package(Boost REQUIRED COMPONENTS program_options filesystem iostreams) -find_package(Threads REQUIRED) +find_package(OpenMP REQUIRED) ###################################################################################################### # Application target and properties @@ -40,6 +40,7 @@ target_link_libraries(build_n Boost::program_options Boost::filesystem Boost::iostreams + OpenMP::OpenMP_CXX strasser::csv_parser rappas::core_n rappas::utils diff --git a/build/src/build/db_builder.cpp b/build/src/build/db_builder.cpp index 8d1790c..d1a5227 100644 --- a/build/src/build/db_builder.cpp +++ b/build/src/build/db_builder.cpp @@ -11,34 +11,45 @@ using std::string; using std::cout, std::endl; using std::to_string; +using core::phylo_kmer, core::phylo_kmer_db, core::phylo_tree; -class db_builder +namespace rappas { - friend core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file, - const std::string& tree_file, const std::string& extended_mapping_file, - const std::string& artree_mapping_file, size_t kmer_size); -public: - db_builder(const std::string& working_directory, const std::string& ar_probabilities_file, - const std::string& tree_file, const std::string& extended_mapping_file, - const std::string& artree_mapping_file, size_t kmer_size); - - void run(); - -private: - size_t explore_kmers(const core::phylo_tree& tree, const proba_matrix& probas); - size_t explore_branch(const branch_entry& probas, core::phylo_kmer::branch_type common_branch_label); - - std::string _working_directory; - std::string _ar_probabilities_file; - std::string _tree_file; - std::string _extended_mapping_file; - std::string _artree_mapping_file; - - size_t _kmer_size; - core::phylo_kmer_db _phylo_kmer_db; - extended_mapping _extended_mapping; - artree_label_mapping _artree_mapping; -}; + class db_builder + { + friend phylo_kmer_db build(const string& working_directory, const string& ar_probabilities_file, + const string& tree_file, const string& extended_mapping_file, + const string& artree_mapping_file, size_t kmer_size); + public: + using branch_hash_map = core::hash_map; + + db_builder(const string& working_directory, const string& ar_probabilities_file, + const string& tree_file, const string& extended_mapping_file, + const string& artree_mapping_file, size_t kmer_size); + + void run(); + + private: + size_t explore_kmers(const phylo_tree& tree, const proba_matrix& probas); + std::pair explore_branch(const node_entry& probas); + + string _working_directory; + string _ar_probabilities_file; + string _tree_file; + string _extended_mapping_file; + string _artree_mapping_file; + + size_t _kmer_size; + phylo_kmer_db _phylo_kmer_db; + std::vector _branch_maps; + + extended_mapping _extended_mapping; + artree_label_mapping _artree_mapping; + }; + +} + +using namespace rappas; db_builder::db_builder(const string& working_directory, const string& ar_probabilities_file, const string& tree_file, const string& extended_mapping_file, const string& artree_mapping_file, size_t kmer_size) @@ -51,60 +62,112 @@ db_builder::db_builder(const string& working_directory, const string& ar_probabi , _phylo_kmer_db{ kmer_size } {} -size_t db_builder::explore_branch(const branch_entry& probas, core::phylo_kmer::branch_type original_id) +/// Puts a key-value pair in a hash map. Used to process branches in parallel +void put(db_builder::branch_hash_map& map, phylo_kmer::key_type key, phylo_kmer::score_type score) { + if (auto it = map.find(key); it != map.end()) + { + if (it->second < score) + { + map[key] = score; + } + } + else + { + map[key] = score; + } +} + +std::pair db_builder::explore_branch(const node_entry& probas) +{ + branch_hash_map hash_map; size_t count = 0; for (auto window = probas.begin(_kmer_size); window != probas.end(); ++window) { for (const auto& kmer : *window) { - _phylo_kmer_db.put(kmer.key, original_id, kmer.score); + put(hash_map, kmer.key, kmer.score); ++count; } } - return count; + return { std::move(hash_map), count }; } -bool is_fake(const core::phylo_node& node) +bool is_ghost(const core::phylo_node& node) { const string& label = node.get_label(); return boost::ends_with(label, "_X0") || boost::ends_with(label, "_X1"); } +std::vector get_ghost_ids(const core::phylo_tree& tree) +{ + std::vector branch_ids; + + for (const auto& branch_node: tree) + { + if (is_ghost(branch_node)) + { + branch_ids.push_back(branch_node.get_label()); + } + } + return branch_ids; +} + + size_t db_builder::explore_kmers(const core::phylo_tree& tree, const proba_matrix& probas) { size_t count = 0; - /// iterate over fake nodes - for (const auto& branch_node: tree) + /// Filter ghost nodes + const auto ghost_node_ids = get_ghost_ids(tree); + std::vector original_node_ids(ghost_node_ids.size()); + + /// Process branches in parallel. Results of the branch-and-bound algorithm are stored + /// in a hash map for every branch separately. + _branch_maps.resize(tree.get_node_count()); + #pragma omp parallel for schedule(auto) // num_threads(num_threads) + for (size_t i = 0; i < ghost_node_ids.size(); ++i) { - if (is_fake(branch_node)) + const auto branch_node_label = ghost_node_ids[i]; + original_node_ids[i] = _extended_mapping[branch_node_label]; + + /// Get submatrix of probabilities for a current branch node (if presented in proba matrix) + const auto phyml_node_label = _artree_mapping[branch_node_label]; + if (const auto& it = probas.find(phyml_node_label); it != probas.end()) + { + size_t branch_count; + std::tie(_branch_maps[i], branch_count) = explore_branch(it->second); + count += branch_count; + } + } + + /// Merge hash maps in a final data structure + for (size_t i = 0; i < _branch_maps.size(); ++i) + { + auto& map = _branch_maps[i]; + for (const auto& [key, score] : map) { - const auto original_id = _extended_mapping[branch_node.get_label()]; - - /// get submatrix of probabilities for a current branch node (if presented in proba matrix) - const auto phyml_branch_label = _artree_mapping[branch_node.get_label()]; - if (const auto& it = probas.find(phyml_branch_label); it != probas.end()) - { - count += explore_branch(it->second, original_id); - } + _phylo_kmer_db.put(key, original_node_ids[i], score); } + /// Replace a map with an empty one to free memory + map = {}; } + return count; } void db_builder::run() { - _extended_mapping = load_extended_mapping(_extended_mapping_file); - _artree_mapping = load_artree_mapping(_artree_mapping_file); + _extended_mapping = rappas::io::load_extended_mapping(_extended_mapping_file); + _artree_mapping = rappas::io::load_artree_mapping(_artree_mapping_file); - const auto tree = core::load_newick(_tree_file); - const auto probas = load_phyml_probas(_ar_probabilities_file); + const auto tree = rappas::io::load_newick(_tree_file); + const auto proba_matrix = rappas::io::load_phyml_probas(_ar_probabilities_file); /// Run the branch and bound algorithm std::cout << "Building database..." << std::endl; std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - const auto tuples_count = explore_kmers(tree, probas); + const auto tuples_count = explore_kmers(tree, proba_matrix); auto end = std::chrono::steady_clock::now(); size_t total_entries = 0; @@ -118,12 +181,15 @@ void db_builder::run() << "\n\n" << std::flush; } -core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file, - const std::string& tree_file, const std::string& extended_mapping_file, - const std::string& artree_mapping_file, size_t kmer_size) +namespace rappas { - db_builder builder(working_directory, ar_probabilities_file, tree_file, - extended_mapping_file, artree_mapping_file, kmer_size); - builder.run(); - return std::move(builder._phylo_kmer_db); -} \ No newline at end of file + phylo_kmer_db build(const std::string& working_directory, const std::string& ar_probabilities_file, + const std::string& tree_file, const std::string& extended_mapping_file, + const std::string& artree_mapping_file, size_t kmer_size) + { + db_builder builder(working_directory, ar_probabilities_file, tree_file, + extended_mapping_file, artree_mapping_file, kmer_size); + builder.run(); + return std::move(builder._phylo_kmer_db); + } +} diff --git a/build/src/build/db_builder.h b/build/src/build/db_builder.h index b3c9799..4daf4b7 100644 --- a/build/src/build/db_builder.h +++ b/build/src/build/db_builder.h @@ -8,8 +8,11 @@ namespace core class phylo_kmer_db; } -core::phylo_kmer_db build_database(const std::string& working_directory, const std::string& ar_probabilities_file, - const std::string& tree_file, const std::string& extended_mapping_file, - const std::string& artree_mapping_file, size_t kmer_size); +namespace rappas +{ + core::phylo_kmer_db build(const std::string& working_directory, const std::string& ar_probabilities_file, + const std::string& tree_file, const std::string& extended_mapping_file, + const std::string& artree_mapping_file, size_t kmer_size); +} #endif \ No newline at end of file diff --git a/build/src/main.cpp b/build/src/main.cpp index b1ffceb..952ba43 100644 --- a/build/src/main.cpp +++ b/build/src/main.cpp @@ -1,8 +1,8 @@ #include #include +#include #include #include -#include #include "cli/command_line.h" #include "cli/exceptions.h" #include "return.h" @@ -29,11 +29,12 @@ return_code run(const cli::cli_parameters& parameters) } case cli::build: { - const auto db = build_database(parameters.working_directory, parameters.ar_probabilities_file, - parameters.tree_file, parameters.extended_mapping_file, parameters.artree_mapping_file, - parameters.kmer_size); + const auto db = rappas::build(parameters.working_directory, parameters.ar_probabilities_file, + parameters.tree_file, parameters.extended_mapping_file, + parameters.artree_mapping_file, parameters.kmer_size); const auto db_filename = fs::path(parameters.working_directory) / "DB.union"; + std::cout << "Saving database to: " << db_filename.string() << "..." << std::endl; std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); core::save(db, db_filename.string()); diff --git a/build/src/pp_matrix/branch_entry.cpp b/build/src/pp_matrix/branch_entry.cpp index 8474991..b0e1004 100644 --- a/build/src/pp_matrix/branch_entry.cpp +++ b/build/src/pp_matrix/branch_entry.cpp @@ -1,43 +1,45 @@ #include "branch_entry.h" -branch_entry::branch_entry(branch_id _id, vector_type&& rows) +using namespace rappas; + +node_entry::node_entry(branch_type _id, vector_type&& rows) : _branch_label{ _id } , _rows{ std::move(rows) } {} -branch_entry::const_iterator branch_entry::begin(uint32_t kmer_size) const +node_entry::const_iterator node_entry::begin(uint32_t kmer_size) const { return { { this, 0, kmer_size } }; } -branch_entry::const_iterator branch_entry::end() const +node_entry::const_iterator node_entry::end() const { return { { this, 0, 0 } }; } -void branch_entry::push_back(row&& r) +void node_entry::push_back(row_type&& row) { - _rows.push_back(r); + _rows.push_back(row); } -size_t branch_entry::get_alignment_size() const +size_t node_entry::get_alignment_size() const { return _rows.size(); } -branch_id branch_entry::get_branch_label() const +branch_type node_entry::get_label() const { return _branch_label; } -const proba_pair& branch_entry::at(size_t position, size_t variant) const +const proba_pair& node_entry::at(size_t position, size_t variant) const { return _rows[position][variant]; } -bool operator==(const branch_entry& lhs, const branch_entry& rhs) +bool operator==(const node_entry& lhs, const node_entry& rhs) { - return lhs.get_branch_label() == rhs.get_branch_label(); + return lhs.get_label() == rhs.get_label(); } view_iterator::view_iterator(branch_entry_view view) noexcept diff --git a/build/src/pp_matrix/branch_entry.h b/build/src/pp_matrix/branch_entry.h index 14e16b6..3e5618d 100644 --- a/build/src/pp_matrix/branch_entry.h +++ b/build/src/pp_matrix/branch_entry.h @@ -7,36 +7,36 @@ class view_iterator; /// \brief A submatrix of posterior probabilities matrix (fixed branch, all the positions of input alignment) -class branch_entry final +class node_entry final { public: using const_iterator = view_iterator; - using vector_type = std::vector; + using vector_type = std::vector; - explicit branch_entry() noexcept = default; - branch_entry(branch_id _id, vector_type&& rows); - branch_entry(const branch_entry&) = delete; - branch_entry(branch_entry&&) = default; - branch_entry& operator=(const branch_entry&) = delete; - branch_entry& operator=(branch_entry&&) = default; - ~branch_entry() noexcept = default; + explicit node_entry() noexcept = default; + node_entry(rappas::branch_type _id, vector_type&& rows); + node_entry(const node_entry&) = delete; + node_entry(node_entry&&) = default; + node_entry& operator=(const node_entry&) = delete; + node_entry& operator=(node_entry&&) = default; + ~node_entry() noexcept = default; const_iterator begin(uint32_t kmer_size) const; const_iterator end() const; - void push_back(row&& r); + void push_back(rappas::row_type&& row); size_t get_alignment_size() const; - branch_id get_branch_label() const; + rappas::branch_type get_label() const; - const proba_pair& at(size_t position, size_t variant) const; + const rappas::proba_pair& at(size_t position, size_t variant) const; private: - branch_id _branch_label; + rappas::branch_type _branch_label; vector_type _rows; }; -bool operator==(const branch_entry& lhs, const branch_entry& rhs); +bool operator==(const node_entry& lhs, const node_entry& rhs); class view_iterator { diff --git a/build/src/pp_matrix/branch_entry_view.cpp b/build/src/pp_matrix/branch_entry_view.cpp index 67dd7db..4db6fba 100644 --- a/build/src/pp_matrix/branch_entry_view.cpp +++ b/build/src/pp_matrix/branch_entry_view.cpp @@ -3,7 +3,7 @@ #include #include -phylo_kmer_iterator::phylo_kmer_iterator(const branch_entry* entry, size_t kmer_size, +phylo_kmer_iterator::phylo_kmer_iterator(const node_entry* entry, size_t kmer_size, core::phylo_kmer::pos_type start_pos, stack_type stack) noexcept : _entry{ entry } , _kmer_size{ kmer_size } @@ -127,7 +127,7 @@ phylo_kmer_iterator::phylo_mmer phylo_kmer_iterator::next_phylokmer() return {}; } -branch_entry_view::branch_entry_view(const branch_entry* entry, core::phylo_kmer::pos_type start, core::phylo_kmer::pos_type end) noexcept +branch_entry_view::branch_entry_view(const node_entry* entry, core::phylo_kmer::pos_type start, core::phylo_kmer::pos_type end) noexcept : _entry{ entry } , _start{ start } , _end{ end } @@ -166,7 +166,7 @@ branch_entry_view& branch_entry_view::operator=(branch_entry_view&& other) noexc return *this; } -const branch_entry* branch_entry_view::get_entry() const +const node_entry* branch_entry_view::get_entry() const { return _entry; } diff --git a/build/src/pp_matrix/branch_entry_view.h b/build/src/pp_matrix/branch_entry_view.h index 34b19b3..1517ed9 100644 --- a/build/src/pp_matrix/branch_entry_view.h +++ b/build/src/pp_matrix/branch_entry_view.h @@ -4,7 +4,7 @@ #include #include "row.h" -class branch_entry; +class node_entry; /// \brief A forward access const iterator for phylo_kmer pairs [kmer value, score]. Iterates over /// a fixed branch_entry_view of size K. @@ -25,7 +25,7 @@ class phylo_kmer_iterator using pointer = const core::phylo_kmer*; using stack_type = boost::container::static_vector; - phylo_kmer_iterator(const branch_entry* entry, size_t kmer_size, + phylo_kmer_iterator(const node_entry* entry, size_t kmer_size, core::phylo_kmer::pos_type start_pos, stack_type stack) noexcept; phylo_kmer_iterator(const phylo_kmer_iterator&) = delete; phylo_kmer_iterator(phylo_kmer_iterator&&) = default; @@ -45,7 +45,7 @@ class phylo_kmer_iterator void next_position(); phylo_mmer next_phylokmer(); - const branch_entry* _entry; + const node_entry* _entry; size_t _kmer_size; const core::phylo_kmer::pos_type _start_pos; const core::phylo_kmer::score_type _threshold; @@ -60,7 +60,7 @@ class branch_entry_view final using const_iterator = phylo_kmer_iterator; using const_reference = const_iterator::reference; - branch_entry_view(const branch_entry* entry, core::phylo_kmer::pos_type start, core::phylo_kmer::pos_type end) noexcept; + branch_entry_view(const node_entry* entry, core::phylo_kmer::pos_type start, core::phylo_kmer::pos_type end) noexcept; branch_entry_view(const branch_entry_view& other) noexcept; branch_entry_view(branch_entry_view&&) = delete; branch_entry_view& operator=(const branch_entry_view&) = delete; @@ -70,12 +70,12 @@ class branch_entry_view final const_iterator begin() const; const_iterator end() const; - const branch_entry* get_entry() const; + const node_entry* get_entry() const; core::phylo_kmer::pos_type get_start_pos() const; core::phylo_kmer::pos_type get_end_pos() const; private: - const branch_entry* _entry; + const node_entry* _entry; core::phylo_kmer::pos_type _start; core::phylo_kmer::pos_type _end; }; diff --git a/build/src/pp_matrix/phyml.cpp b/build/src/pp_matrix/phyml.cpp index 0f4bc2a..d396afe 100644 --- a/build/src/pp_matrix/phyml.cpp +++ b/build/src/pp_matrix/phyml.cpp @@ -8,132 +8,159 @@ using std::string; using std::cout, std::endl; -class phyml_output_reader +/// \brief Returns if the input string is a number +bool is_number(const std::string& s) { -public: - phyml_output_reader(const string& file_name) - : _file_name(file_name) - {} - - phyml_output_reader(const phyml_output_reader&) = delete; - ~phyml_output_reader() = default; - - proba_matrix read() + auto it = s.begin(); + while (it != s.end() && std::isdigit(*it)) { - try - { - cout << "Loading PhyML results: " + _file_name << "..." << endl; - std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - auto matrix = read_matrix(); - - cout << "Loaded " << matrix.num_branches() << " matrices of " << - matrix.num_sites() << " rows." << endl; - cout << "Time (ms): " << std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin).count() << endl << endl; - return matrix; - } - catch (io::error::integer_overflow& error) - { - throw std::runtime_error("PhyML result parsing error: " + string(error.what())); - } + ++it; } + return !s.empty() && it == s.end(); +} + -private: - proba_matrix read_matrix() +namespace rappas +{ + namespace io { - proba_matrix matrix; - - io::CSVReader<5, - io::trim_chars<' '>, - io::no_quote_escape<'\t'>, - io::throw_on_overflow, - io::single_and_empty_line_comment<'.'>> _in(_file_name); - _in.read_header(io::ignore_extra_column, "NodeLabel", "A", "C", "G", "T"); - - auto node_label = proba_matrix::NOT_A_LABEL; - core::phylo_kmer::score_type a, c, g, t; - while (_in.read_row(node_label, a, c, g, t)) + /// \brief Reads a PhyML output into a matrix. + class phyml_output_reader { - if (node_label == proba_matrix::NOT_A_LABEL) - { - throw std::runtime_error("Node label value is too big: " + std::to_string(node_label)); - } + public: + phyml_output_reader(const string& file_name) noexcept; + phyml_output_reader(const phyml_output_reader&) = delete; + phyml_output_reader(phyml_output_reader&&) = delete; + phyml_output_reader& operator=(const phyml_output_reader&) = delete; + phyml_output_reader& operator=(phyml_output_reader&&) = delete; + ~phyml_output_reader() noexcept = default; - /// log-transform the probabilities - auto new_row = row { { { a, 0 }, { c, 1 }, { g, 2 }, { t, 3 } } }; - auto log = [](const proba_pair& p) { return proba_pair{ std::log10(p.score), p.index }; }; - std::transform(begin(new_row), end(new_row), begin(new_row), log); + rappas::proba_matrix read(); - // sort them - auto compare = [](const proba_pair& p1, const proba_pair& p2) { return p1.score > p2.score; }; - std::sort(begin(new_row), end(new_row), compare); + private: + rappas::proba_matrix read_matrix(); - /// insert - auto it = matrix.find(node_label); - if (it != std::end(matrix)) - { - it->second.push_back(std::move(new_row)); - } - else - { - matrix[node_label] = proba_matrix::mapped_type{node_label, {std::move(new_row)} }; - } - } - return matrix; + string _file_name; + }; } - -private: - string _file_name; -}; - -proba_matrix load_phyml_probas(const std::string& file_name) -{ - phyml_output_reader reader(file_name); - return reader.read(); } -extended_mapping load_extended_mapping(const std::string& file_name) +rappas::io::phyml_output_reader::phyml_output_reader(const string& file_name) noexcept + : _file_name{ file_name } +{} + +rappas::proba_matrix rappas::io::phyml_output_reader::read() { - cout << "Loading a node mapping: " + file_name << endl; - artree_label_mapping mapping; - - io::CSVReader<2, io::trim_chars<' '>, io::no_quote_escape<'\t'>> in(file_name); - in.read_header(io::ignore_extra_column, "original_id", "extended_name"); - std::string extended_name; - branch_id original_id; - while (in.read_row(original_id, extended_name)) + try + { + cout << "Loading PhyML results: " + _file_name << "..." << endl; + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); + auto matrix = read_matrix(); + + cout << "Loaded " << matrix.num_branches() << " matrices of " << + matrix.num_sites() << " rows." << endl; + cout << "Time (ms): " << std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin).count() << endl << endl; + return matrix; + } + catch (::io::error::integer_overflow& error) { - mapping[extended_name] = original_id; + throw std::runtime_error("PhyML result parsing error: " + string(error.what())); } - cout << "Loaded " << mapping.size() << " mapped ids." << endl << endl; - return mapping; } -bool is_number(const std::string& s) +rappas::proba_matrix rappas::io::phyml_output_reader::read_matrix() { - auto it = s.begin(); - while (it != s.end() && std::isdigit(*it)) + proba_matrix matrix; + + ::io::CSVReader<5, + ::io::trim_chars<' '>, + ::io::no_quote_escape<'\t'>, + ::io::throw_on_overflow, + ::io::single_and_empty_line_comment<'.'>> _in(_file_name); + _in.read_header(::io::ignore_extra_column, "NodeLabel", "A", "C", "G", "T"); + + auto node_label = proba_matrix::NOT_A_LABEL; + core::phylo_kmer::score_type a, c, g, t; + while (_in.read_row(node_label, a, c, g, t)) { - ++it; + if (node_label == proba_matrix::NOT_A_LABEL) + { + throw std::runtime_error("Node label value is too big: " + std::to_string(node_label)); + } + + /// log-transform the probabilities + auto new_row = row_type { { { a, 0 }, { c, 1 }, { g, 2 }, { t, 3 } } }; + auto log = [](const proba_pair& p) { return proba_pair{ std::log10(p.score), p.index }; }; + std::transform(begin(new_row), end(new_row), begin(new_row), log); + + // sort them + auto compare = [](const proba_pair& p1, const proba_pair& p2) { return p1.score > p2.score; }; + std::sort(begin(new_row), end(new_row), compare); + + /// insert + auto it = matrix.find(node_label); + if (it != std::end(matrix)) + { + it->second.push_back(std::move(new_row)); + } + else + { + /// WARNING: it is cheap to copy new_row here in the case of DNA. + /// It is probably makes sense to move it in the case of amino acids though. + matrix[node_label] = proba_matrix::mapped_type{node_label, { new_row } }; + } } - return !s.empty() && it == s.end(); + return matrix; } -artree_label_mapping load_artree_mapping(const std::string& file_name) -{ - cout << "Loading a node mapping: " + file_name << endl; - artree_label_mapping mapping; - io::CSVReader<2, io::trim_chars<' '>, io::no_quote_escape<'\t'>> in(file_name); - in.read_header(io::ignore_extra_column, "extended_label", "ARtree_label"); - std::string extended_label, artree_label; - while (in.read_row(extended_label, artree_label)) +namespace rappas +{ + namespace io { - if (is_number(artree_label)) + rappas::proba_matrix load_phyml_probas(const std::string& file_name) + { + phyml_output_reader reader(file_name); + return reader.read(); + } + + /// \brief Reads a "extended_tree_node_mapping.tsv" file produced by the old RAPPAS. + extended_mapping load_extended_mapping(const std::string& file_name) + { + cout << "Loading a node mapping: " + file_name << endl; + artree_label_mapping mapping; + + ::io::CSVReader<2, ::io::trim_chars<' '>, ::io::no_quote_escape<'\t'>> in(file_name); + in.read_header(::io::ignore_extra_column, "original_id", "extended_name"); + std::string extended_name; + branch_type original_id; + while (in.read_row(original_id, extended_name)) + { + mapping[extended_name] = original_id; + } + cout << "Loaded " << mapping.size() << " mapped ids." << endl << endl; + return mapping; + } + + /// \brief Reads a "ARtree_id_mapping.tsv" file produced by the old RAPPAS. + artree_label_mapping load_artree_mapping(const std::string& file_name) { - mapping[extended_label] = core::phylo_kmer::branch_type(std::stoul(artree_label)); + cout << "Loading a node mapping: " + file_name << endl; + artree_label_mapping mapping; + + ::io::CSVReader<2, ::io::trim_chars<' '>, ::io::no_quote_escape<'\t'>> in(file_name); + in.read_header(::io::ignore_extra_column, "extended_label", "ARtree_label"); + std::string extended_label, artree_label; + while (in.read_row(extended_label, artree_label)) + { + if (is_number(artree_label)) + { + mapping[extended_label] = core::phylo_kmer::branch_type(std::stoul(artree_label)); + } + } + cout << "Loaded " << mapping.size() << " mapped ids." << endl << endl; + return mapping; } } - cout << "Loaded " << mapping.size() << " mapped ids." << endl << endl; - return mapping; } \ No newline at end of file diff --git a/build/src/pp_matrix/phyml.h b/build/src/pp_matrix/phyml.h index 661b412..72f30fc 100644 --- a/build/src/pp_matrix/phyml.h +++ b/build/src/pp_matrix/phyml.h @@ -7,15 +7,24 @@ class proba_matrix; -/// Load a posterior probability matrix from a file, generated by PhyML -proba_matrix load_phyml_probas(const std::string& file_name); +namespace rappas +{ + class proba_matrix; -using artree_label_mapping = std::unordered_map; -using extended_mapping = std::unordered_map; + using artree_label_mapping = std::unordered_map; + using extended_mapping = std::unordered_map; + + namespace io + { + /// Load a posterior probability matrix from a file, generated by PhyML + rappas::proba_matrix load_phyml_probas(const std::string& file_name); + + /// Load a node mapping from file + extended_mapping load_extended_mapping(const std::string& file_name); + artree_label_mapping load_artree_mapping(const std::string& file_name); + } +} -/// Load a node mapping from file -extended_mapping load_extended_mapping(const std::string& file_name); -artree_label_mapping load_artree_mapping(const std::string& file_name); #endif \ No newline at end of file diff --git a/build/src/pp_matrix/proba_matrix.cpp b/build/src/pp_matrix/proba_matrix.cpp index b235876..2a569d5 100644 --- a/build/src/pp_matrix/proba_matrix.cpp +++ b/build/src/pp_matrix/proba_matrix.cpp @@ -1,6 +1,8 @@ #include "proba_matrix.h" #include +using namespace rappas; + size_t proba_matrix::num_branches() const { return _data.size(); @@ -11,22 +13,22 @@ size_t proba_matrix::num_sites() const return std::begin(_data)->second.get_alignment_size(); } -proba_matrix::mapped_type& proba_matrix::operator[](branch_id id) +proba_matrix::mapped_type& proba_matrix::operator[](branch_type id) { return _data[id]; } -const proba_matrix::mapped_type& proba_matrix::at(branch_id id) const +const proba_matrix::mapped_type& proba_matrix::at(branch_type id) const { return _data.at(id); } -proba_matrix::iterator proba_matrix::find(const branch_id& id) +proba_matrix::iterator proba_matrix::find(const branch_type& id) { return _data.find(id); } -proba_matrix::const_iterator proba_matrix::find(const branch_id& id) const +proba_matrix::const_iterator proba_matrix::find(const branch_type& id) const { return _data.find(id); } diff --git a/build/src/pp_matrix/proba_matrix.h b/build/src/pp_matrix/proba_matrix.h index 08063d2..22c97f2 100644 --- a/build/src/pp_matrix/proba_matrix.h +++ b/build/src/pp_matrix/proba_matrix.h @@ -4,48 +4,52 @@ #include #include "branch_entry.h" -/// \brief A posterior probabilities matrix class. -/// \details A matrix class for storing posterior probabilities, given by the ancestral reconstruction -/// algorithm. So, this is a matrix of size [#branch_nodes x #sites x #variants], where: -/// - #branch_nodes is the number of non-leaf nodes of input tree -/// - #sites is the size of input alignment, -/// - #variants is the alphabet size. -class proba_matrix final +namespace rappas { -public: - static const branch_id NOT_A_LABEL = std::numeric_limits::max(); - - /// a map for a fast access to a submatrix by branch node label - using storage = std::unordered_map; - using iterator = typename storage::iterator; - using const_iterator = typename storage::const_iterator; - using mapped_type = storage::mapped_type; - - proba_matrix() = default; - proba_matrix(const proba_matrix&) = delete; - proba_matrix(proba_matrix&& other) = default; - proba_matrix& operator=(const proba_matrix&) = delete; - proba_matrix& operator=(proba_matrix&&) = delete; - ~proba_matrix() = default; - - /// capacity - size_t num_branches() const; - size_t num_sites() const; - - // Lookup - mapped_type& operator[](branch_id id); - const mapped_type& at(branch_id id) const; - iterator find(const branch_id& id); - const_iterator find(const branch_id& id) const; - - /// iterators - iterator begin(); - iterator end(); - const_iterator begin() const; - const_iterator end() const; - -private: - storage _data; -}; + + /// \brief A posterior probabilities matrix class. + /// \details A matrix class for storing posterior probabilities, given by the ancestral reconstruction + /// algorithm. So, this is a matrix of size [#branch_nodes x #sites x #variants], where: + /// - #branch_nodes is the number of non-leaf nodes of input tree + /// - #sites is the size of input alignment, + /// - #variants is the alphabet size. + class proba_matrix final + { + public: + static const branch_type NOT_A_LABEL = std::numeric_limits::max(); + + /// a map for a fast access to a submatrix by branch node label + using storage = std::unordered_map; + using iterator = typename storage::iterator; + using const_iterator = typename storage::const_iterator; + using mapped_type = storage::mapped_type; + + proba_matrix() = default; + proba_matrix(const proba_matrix&) = delete; + proba_matrix(proba_matrix&& other) = default; + proba_matrix& operator=(const proba_matrix&) = delete; + proba_matrix& operator=(proba_matrix&&) = delete; + ~proba_matrix() = default; + + /// capacity + size_t num_branches() const; + size_t num_sites() const; + + // Lookup + mapped_type& operator[](branch_type id); + const mapped_type& at(branch_type id) const; + iterator find(const branch_type& id); + const_iterator find(const branch_type& id) const; + + /// iterators + iterator begin(); + iterator end(); + const_iterator begin() const; + const_iterator end() const; + + private: + storage _data; + }; +} #endif \ No newline at end of file diff --git a/build/src/pp_matrix/row.h b/build/src/pp_matrix/row.h index 01a7e8f..9aace44 100644 --- a/build/src/pp_matrix/row.h +++ b/build/src/pp_matrix/row.h @@ -4,21 +4,16 @@ #include #include -struct alignment +namespace rappas { - /// alignment position type - using pos_type = size_t; + struct proba_pair + { + core::phylo_kmer::score_type score; + size_t index; + }; - static constexpr pos_type not_a_position = std::numeric_limits::max(); -}; - -struct proba_pair -{ - core::phylo_kmer::score_type score; - size_t index; -}; - -using branch_id = uint16_t; -using row = std::array; + using branch_type = core::phylo_kmer::branch_type; + using row_type = std::array; +} #endif //RAPPAS_CPP_ROW_H diff --git a/core b/core index 611222c..be28503 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 611222c3c9f185c5c114a2342f91307ae6882ced +Subproject commit be285036d5f5c26744cf1c9dc0323e089b93f73f