From ed55fc4a6b0eda09686ae0a2deda0f341451985b Mon Sep 17 00:00:00 2001 From: Radu Muntean Date: Thu, 5 Aug 2021 11:15:27 +0200 Subject: [PATCH] small reformat Signed-off-by: Radu Muntean --- .../annotation/taxonomy/label_to_taxid.cpp | 95 ------------ .../annotation/taxonomy/tax_classifier.cpp | 142 ++++++++++++++---- .../annotation/taxonomy/tax_classifier.hpp | 25 ++- .../annotation/taxonomy/test_taxonomy.cpp | 9 +- 4 files changed, 129 insertions(+), 142 deletions(-) delete mode 100644 metagraph/src/annotation/taxonomy/label_to_taxid.cpp diff --git a/metagraph/src/annotation/taxonomy/label_to_taxid.cpp b/metagraph/src/annotation/taxonomy/label_to_taxid.cpp deleted file mode 100644 index ba8dbb13f5..0000000000 --- a/metagraph/src/annotation/taxonomy/label_to_taxid.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "tax_classifier.hpp" - -#include "common/utils/string_utils.hpp" - -#include "common/logger.hpp" - -namespace mtg { -namespace annot { - -using mtg::common::logger; - -void TaxonomyBase::assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map) { - if (utils::starts_with(label, "gi|")) { - // e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome - label_type = GEN_BANK; - *require_accversion_to_taxid_map = true; - } else if (utils::starts_with(utils::split_string(label, ":")[1], "taxid|")) { - // e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome - label_type = TAXID; - *require_accversion_to_taxid_map = false; - } else { - logger->error("Can't determine the type of the given label {}. Please make sure that the labels are in a recognized format.", label); - std::exit(1); - } -} - -bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid) const { - if (label_type == TAXID) { - *taxid = static_cast(std::stoull(utils::split_string(label, "|")[1])); - return true; - } else if (TaxonomyBase::label_type == GEN_BANK) { - std::string acc_version = get_accession_version_from_label(label); - if (not accversion_to_taxid_map.count(acc_version)) { - return false; - } - *taxid = accversion_to_taxid_map.at(acc_version); - return true; - } - - logger->error("Run get_taxid_from_label() for unknown label {}.", label); - std::exit(1); -} - -std::string TaxonomyBase::get_accession_version_from_label(const std::string &label) const { - if (label_type == TAXID) { - return utils::split_string(utils::split_string(label, "|")[2], " ")[0]; - } else if (label_type == GEN_BANK) { - return utils::split_string(label, "|")[3];; - } - - logger->error("Run get_accession_version_from_label() for unknown label {}.", label); - std::exit(1); -} - -// TODO improve this by parsing the compressed ".gz" version (or use https://github.com/pmenzel/taxonomy-tools) -void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath, - const graph::AnnotatedDBG *anno_matrix = NULL) { - std::ifstream f(filepath); - if (!f.good()) { - logger->error("Failed to open accession to taxid map table {}", filepath); - exit(1); - } - - std::string line; - getline(f, line); - if (!utils::starts_with(line, "accession\taccession.version\ttaxid\t")) { - logger->error("The accession to taxid map table is not in the standard (*.accession2taxid) format {}.", filepath); - exit(1); - } - - tsl::hopscotch_set input_accessions; - if (anno_matrix != NULL) { - for (const std::string &label : anno_matrix->get_annotation().get_all_labels()) { - input_accessions.insert(get_accession_version_from_label(label)); - } - } - - while (getline(f, line)) { - if (line == "") { - logger->error("The accession to taxid map table contains empty lines. Please make sure that this file was not manually modified {}.", filepath); - exit(1); - } - std::vector parts = utils::split_string(line, "\t"); - if (parts.size() <= 2) { - logger->error("The accession to taxid map table contains incomplete lines. Please make sure that this file was not manually modified {}.", filepath); - exit(1); - } - if (input_accessions.size() == 0 || input_accessions.count(parts[1])) { - accversion_to_taxid_map[parts[1]] = static_cast(std::stoull(parts[2])); - } - } -} - -} // namespace annot -} // namespace mtg diff --git a/metagraph/src/annotation/taxonomy/tax_classifier.cpp b/metagraph/src/annotation/taxonomy/tax_classifier.cpp index 247df4cd94..d8c4a2b125 100644 --- a/metagraph/src/annotation/taxonomy/tax_classifier.cpp +++ b/metagraph/src/annotation/taxonomy/tax_classifier.cpp @@ -6,6 +6,7 @@ #include "annotation/representation/annotation_matrix/annotation_matrix.hpp" #include "common/unix_tools.hpp" +#include "common/utils/string_utils.hpp" #include "common/logger.hpp" @@ -14,15 +15,102 @@ namespace annot { using mtg::common::logger; +void TaxonomyBase::assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map) { + if (utils::starts_with(label, "gi|")) { + // e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome + label_type = GEN_BANK; + *require_accversion_to_taxid_map = true; + } else if (utils::starts_with(utils::split_string(label, ":")[1], "taxid|")) { + // e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome + label_type = TAXID; + *require_accversion_to_taxid_map = false; + } else { + logger->error("Can't determine the type of the given label {}. " + "Make sure the labels are in a recognized format.", label); + exit(1); + } +} + +bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid) const { + if (label_type == TAXID) { + *taxid = std::stoul(utils::split_string(label, "|")[1]); + return true; + } else if (TaxonomyBase::label_type == GEN_BANK) { + std::string acc_version = get_accession_version_from_label(label); + if (not accversion_to_taxid_map.count(acc_version)) { + return false; + } + *taxid = accversion_to_taxid_map.at(acc_version); + return true; + } + + logger->error("Error: Could not get the taxid for label {}", label); + exit(1); +} + +std::string TaxonomyBase::get_accession_version_from_label(const std::string &label) const { + if (label_type == TAXID) { + return utils::split_string(utils::split_string(label, "|")[2], " ")[0]; + } else if (label_type == GEN_BANK) { + return utils::split_string(label, "|")[3];; + } + + logger->error("Error: Could not get the accession version for label {}", label); + exit(1); +} + +// TODO improve this by parsing the compressed ".gz" version (or use https://github.com/pmenzel/taxonomy-tools) +void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath, + const graph::AnnotatedDBG *anno_matrix = NULL) { + std::ifstream f(filepath); + if (!f.good()) { + logger->error("Failed to open accession to taxid map table {}", filepath); + exit(1); + } + + std::string line; + getline(f, line); + if (!utils::starts_with(line, "accession\taccession.version\ttaxid\t")) { + logger->error("The accession to taxid map table is not in the standard (*.accession2taxid) format {}", + filepath); + exit(1); + } + + tsl::hopscotch_set input_accessions; + if (anno_matrix != NULL) { + for (const std::string &label : anno_matrix->get_annotation().get_all_labels()) { + input_accessions.insert(get_accession_version_from_label(label)); + } + } + + while (getline(f, line)) { + if (line == "") { + logger->error("The accession to taxid map table contains empty lines. " + "Please make sure that this file was not manually modified {}", filepath); + exit(1); + } + std::vector parts = utils::split_string(line, "\t"); + if (parts.size() <= 2) { + logger->error("The accession to taxid map table contains incomplete lines. " + "Please make sure that this file was not manually modified {}", filepath); + exit(1); + } + if (input_accessions.size() == 0 || input_accessions.count(parts[1])) { + accversion_to_taxid_map[parts[1]] = std::stoul(parts[2]); + } + } +} + TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno, const double lca_coverage_rate, const double kmers_discovery_rate, const std::string &tax_tree_filepath, - const std::string &label_taxid_map_filepath) : - TaxonomyBase(lca_coverage_rate, kmers_discovery_rate), _anno_matrix(&anno) { + const std::string &label_taxid_map_filepath) + : TaxonomyBase(lca_coverage_rate, kmers_discovery_rate), + _anno_matrix(&anno) { if (!std::filesystem::exists(tax_tree_filepath)) { - logger->error("Can't open taxonomic tree file {}.", tax_tree_filepath); - std::exit(1); + logger->error("Can't open taxonomic tree file {}", tax_tree_filepath); + exit(1); } bool require_accversion_to_taxid_map = false; @@ -30,33 +118,33 @@ TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno, Timer timer; if (require_accversion_to_taxid_map) { - logger->trace("Parsing label_taxid_map file.."); + logger->trace("Parsing label_taxid_map file..."); read_accversion_to_taxid_map(label_taxid_map_filepath, _anno_matrix); - logger->trace("Finished label_taxid_map file in {}s", timer.elapsed()); + logger->trace("Finished label_taxid_map file in {} sec", timer.elapsed()); } timer.reset(); - logger->trace("Parsing taxonomic tree.."); + logger->trace("Parsing taxonomic tree..."); ChildrenList tree; read_tree(tax_tree_filepath, &tree); - logger->trace("Finished taxonomic tree read in {}s.", timer.elapsed()); + logger->trace("Finished taxonomic tree read in {} sec.", timer.elapsed()); timer.reset(); - logger->trace("Calculating tree statistics.."); + logger->trace("Calculating tree statistics..."); std::vector tree_linearization; dfs_statistics(root_node, tree, &tree_linearization); - logger->trace("Finished tree statistics calculation in {}s.", timer.elapsed()); + logger->trace("Finished tree statistics calculation in {} sec.", timer.elapsed()); timer.reset(); - logger->trace("Starting rmq preprocessing.."); + logger->trace("Starting rmq preprocessing..."); rmq_preprocessing(tree_linearization); - logger->trace("Finished rmq preprocessing in {}s.", timer.elapsed()); + logger->trace("Finished rmq preprocessing in {} sec.", timer.elapsed()); } void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenList *tree) { std::ifstream f(tax_tree_filepath); if (!f.good()) { - logger->error("Failed to open Taxonomic Tree file {}.", tax_tree_filepath); + logger->error("Failed to open Taxonomic Tree file {}", tax_tree_filepath); exit(1); } @@ -64,18 +152,20 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi tsl::hopscotch_map full_parents_list; while (getline(f, line)) { if (line == "") { - logger->error("The Taxonomic Tree file contains empty lines. Please make sure that this file was not manually modified: {}.", + logger->error("The Taxonomic Tree file contains empty lines. " + "Please make sure that this file was not manually modified: {}", tax_tree_filepath); exit(1); } std::vector parts = utils::split_string(line, "\t"); if (parts.size() <= 2) { - logger->error("The Taxonomic tree filepath contains incomplete lines. Please make sure that this file was not manually modified: {}.", + logger->error("The Taxonomic tree filepath contains incomplete lines. " + "Please make sure that this file was not manually modified: {}", tax_tree_filepath); exit(1); } - uint32_t act = static_cast(std::stoull(parts[0])); - uint32_t parent = static_cast(std::stoull(parts[2])); + uint32_t act = std::stoul(parts[0]); + uint32_t parent = std::stoul(parts[2]); full_parents_list[act] = parent; node_parent[act] = parent; } @@ -118,7 +208,7 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi } } if (num_taxid_failed) { - logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} evaluations.", + logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} total evaluations.", tax_tree_filepath, num_taxid_failed, relevant_taxids.size()); } @@ -181,24 +271,24 @@ void TaxonomyClsAnno::rmq_preprocessing(const std::vector &tree_lineariza std::vector TaxonomyClsAnno::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const { cerr << "Assign class not implemented reversed = " << reversed << "\n"; - throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size" + to_string(sequence.size())); - exit(0); + throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size" + + to_string(sequence.size())); } std::vector TaxonomyClsImportDB::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const { cerr << "Assign class not implemented reversed = " << reversed << "\n"; - throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size" + to_string(sequence.size())); - exit(0); + throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size" + + to_string(sequence.size())); } TaxId TaxonomyClsAnno::find_lca(const std::vector &taxids) const { - throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size" + to_string(taxids.size())); - exit(0); + throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size" + + to_string(taxids.size())); } TaxId TaxonomyClsImportDB::find_lca(const std::vector &taxids) const { - throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size" + to_string(taxids.size())); - exit(0); + throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size" + + to_string(taxids.size())); } } // namespace annot diff --git a/metagraph/src/annotation/taxonomy/tax_classifier.hpp b/metagraph/src/annotation/taxonomy/tax_classifier.hpp index 12ff6e0e06..f050f8e554 100644 --- a/metagraph/src/annotation/taxonomy/tax_classifier.hpp +++ b/metagraph/src/annotation/taxonomy/tax_classifier.hpp @@ -1,14 +1,6 @@ #ifndef __TAX_CLASSIFIER_HPP__ #define __TAX_CLASSIFIER_HPP__ -#ifdef TESTING -#define PRIVATE_TESTABLE public -#define PROTECTED_TESTABLE public -#else -#define PRIVATE_TESTABLE private -#define PROTECTED_TESTABLE protected -#endif - #include #include @@ -21,7 +13,7 @@ using TaxId = std::uint32_t; using ChildrenList = tsl::hopscotch_map>; class TaxonomyBase { -public: + public: using KmerId = annot::MultiLabelEncoded::Index; using node_index = graph::SequenceGraph::node_index; @@ -32,14 +24,15 @@ class TaxonomyBase { }; TaxonomyBase() {}; - TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate) : - _lca_coverage_rate(lca_coverage_rate), _kmers_discovery_rate(kmers_discovery_rate) {}; + TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate) + : _lca_coverage_rate(lca_coverage_rate), + _kmers_discovery_rate(kmers_discovery_rate) {}; virtual ~TaxonomyBase() {}; TaxId assign_class(const std::string &sequence) const; -PROTECTED_TESTABLE: + protected: void assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map); virtual TaxId find_lca(const std::vector &taxids) const = 0; @@ -102,19 +95,19 @@ class TaxonomyBase { }; class TaxonomyClsImportDB : public TaxonomyBase { -public: + public: // todo implement TaxonomyClsImportDB(const std::string &taxdb_filepath, const double lca_coverage_rate, const double kmers_discovery_rate); -PRIVATE_TESTABLE: + private: std::vector get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const; TaxId find_lca(const std::vector &taxids) const; }; class TaxonomyClsAnno : public TaxonomyBase { -public: + public: /** * TaxonomyCls constructor * @@ -136,7 +129,7 @@ class TaxonomyClsAnno : public TaxonomyBase { TaxId assign_class_toplabels(const std::string &sequence, const double label_fraction) const; -PRIVATE_TESTABLE: + private: /** * Reads and returns the taxonomic tree as a list of children. * diff --git a/metagraph/tests/annotation/taxonomy/test_taxonomy.cpp b/metagraph/tests/annotation/taxonomy/test_taxonomy.cpp index 1543638537..de4e4fe2ce 100644 --- a/metagraph/tests/annotation/taxonomy/test_taxonomy.cpp +++ b/metagraph/tests/annotation/taxonomy/test_taxonomy.cpp @@ -1,14 +1,14 @@ #include "gtest/gtest.h" -#define TESTING - #include #include +#define private public +#define protected public + #include "annotation/taxonomy/tax_classifier.hpp" -namespace mtg { -namespace test { +namespace { TEST (TaxonomyTest, ClsAnno_DfsStatistics) { mtg::annot::TaxonomyClsAnno *tax = new mtg::annot::TaxonomyClsAnno(); @@ -88,4 +88,3 @@ TEST (TaxonomyTest, ClsAnno_RmqPreprocessing) { } } -}