Skip to content

Commit

Permalink
small reformat
Browse files Browse the repository at this point in the history
Signed-off-by: Radu Muntean <[email protected]>
  • Loading branch information
heracle committed Aug 5, 2021
1 parent 76dcb32 commit ed55fc4
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 142 deletions.
95 changes: 0 additions & 95 deletions metagraph/src/annotation/taxonomy/label_to_taxid.cpp

This file was deleted.

142 changes: 116 additions & 26 deletions metagraph/src/annotation/taxonomy/tax_classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "annotation/representation/annotation_matrix/annotation_matrix.hpp"
#include "common/unix_tools.hpp"
#include "common/utils/string_utils.hpp"

#include "common/logger.hpp"

Expand All @@ -14,68 +15,157 @@ namespace annot {

using mtg::common::logger;

void TaxonomyBase::assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map) {
if (utils::starts_with(label, "gi|")) {
// e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome
label_type = GEN_BANK;
*require_accversion_to_taxid_map = true;
} else if (utils::starts_with(utils::split_string(label, ":")[1], "taxid|")) {
// e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome
label_type = TAXID;
*require_accversion_to_taxid_map = false;
} else {
logger->error("Can't determine the type of the given label {}. "
"Make sure the labels are in a recognized format.", label);
exit(1);
}
}

bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid) const {
if (label_type == TAXID) {
*taxid = std::stoul(utils::split_string(label, "|")[1]);
return true;
} else if (TaxonomyBase::label_type == GEN_BANK) {
std::string acc_version = get_accession_version_from_label(label);
if (not accversion_to_taxid_map.count(acc_version)) {
return false;
}
*taxid = accversion_to_taxid_map.at(acc_version);
return true;
}

logger->error("Error: Could not get the taxid for label {}", label);
exit(1);
}

std::string TaxonomyBase::get_accession_version_from_label(const std::string &label) const {
if (label_type == TAXID) {
return utils::split_string(utils::split_string(label, "|")[2], " ")[0];
} else if (label_type == GEN_BANK) {
return utils::split_string(label, "|")[3];;
}

logger->error("Error: Could not get the accession version for label {}", label);
exit(1);
}

// TODO improve this by parsing the compressed ".gz" version (or use https://github.com/pmenzel/taxonomy-tools)
void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath,
const graph::AnnotatedDBG *anno_matrix = NULL) {
std::ifstream f(filepath);
if (!f.good()) {
logger->error("Failed to open accession to taxid map table {}", filepath);
exit(1);
}

std::string line;
getline(f, line);
if (!utils::starts_with(line, "accession\taccession.version\ttaxid\t")) {
logger->error("The accession to taxid map table is not in the standard (*.accession2taxid) format {}",
filepath);
exit(1);
}

tsl::hopscotch_set<std::string> input_accessions;
if (anno_matrix != NULL) {
for (const std::string &label : anno_matrix->get_annotation().get_all_labels()) {
input_accessions.insert(get_accession_version_from_label(label));
}
}

while (getline(f, line)) {
if (line == "") {
logger->error("The accession to taxid map table contains empty lines. "
"Please make sure that this file was not manually modified {}", filepath);
exit(1);
}
std::vector<std::string> parts = utils::split_string(line, "\t");
if (parts.size() <= 2) {
logger->error("The accession to taxid map table contains incomplete lines. "
"Please make sure that this file was not manually modified {}", filepath);
exit(1);
}
if (input_accessions.size() == 0 || input_accessions.count(parts[1])) {
accversion_to_taxid_map[parts[1]] = std::stoul(parts[2]);
}
}
}

TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
const double lca_coverage_rate,
const double kmers_discovery_rate,
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath) :
TaxonomyBase(lca_coverage_rate, kmers_discovery_rate), _anno_matrix(&anno) {
const std::string &label_taxid_map_filepath)
: TaxonomyBase(lca_coverage_rate, kmers_discovery_rate),
_anno_matrix(&anno) {
if (!std::filesystem::exists(tax_tree_filepath)) {
logger->error("Can't open taxonomic tree file {}.", tax_tree_filepath);
std::exit(1);
logger->error("Can't open taxonomic tree file {}", tax_tree_filepath);
exit(1);
}

bool require_accversion_to_taxid_map = false;
assign_label_type(_anno_matrix->get_annotation().get_all_labels()[0], &require_accversion_to_taxid_map);

Timer timer;
if (require_accversion_to_taxid_map) {
logger->trace("Parsing label_taxid_map file..");
logger->trace("Parsing label_taxid_map file...");
read_accversion_to_taxid_map(label_taxid_map_filepath, _anno_matrix);
logger->trace("Finished label_taxid_map file in {}s", timer.elapsed());
logger->trace("Finished label_taxid_map file in {} sec", timer.elapsed());
}

timer.reset();
logger->trace("Parsing taxonomic tree..");
logger->trace("Parsing taxonomic tree...");
ChildrenList tree;
read_tree(tax_tree_filepath, &tree);
logger->trace("Finished taxonomic tree read in {}s.", timer.elapsed());
logger->trace("Finished taxonomic tree read in {} sec.", timer.elapsed());

timer.reset();
logger->trace("Calculating tree statistics..");
logger->trace("Calculating tree statistics...");
std::vector<TaxId> tree_linearization;
dfs_statistics(root_node, tree, &tree_linearization);
logger->trace("Finished tree statistics calculation in {}s.", timer.elapsed());
logger->trace("Finished tree statistics calculation in {} sec.", timer.elapsed());

timer.reset();
logger->trace("Starting rmq preprocessing..");
logger->trace("Starting rmq preprocessing...");
rmq_preprocessing(tree_linearization);
logger->trace("Finished rmq preprocessing in {}s.", timer.elapsed());
logger->trace("Finished rmq preprocessing in {} sec.", timer.elapsed());
}

void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenList *tree) {
std::ifstream f(tax_tree_filepath);
if (!f.good()) {
logger->error("Failed to open Taxonomic Tree file {}.", tax_tree_filepath);
logger->error("Failed to open Taxonomic Tree file {}", tax_tree_filepath);
exit(1);
}

std::string line;
tsl::hopscotch_map<TaxId, TaxId> full_parents_list;
while (getline(f, line)) {
if (line == "") {
logger->error("The Taxonomic Tree file contains empty lines. Please make sure that this file was not manually modified: {}.",
logger->error("The Taxonomic Tree file contains empty lines. "
"Please make sure that this file was not manually modified: {}",
tax_tree_filepath);
exit(1);
}
std::vector<std::string> parts = utils::split_string(line, "\t");
if (parts.size() <= 2) {
logger->error("The Taxonomic tree filepath contains incomplete lines. Please make sure that this file was not manually modified: {}.",
logger->error("The Taxonomic tree filepath contains incomplete lines. "
"Please make sure that this file was not manually modified: {}",
tax_tree_filepath);
exit(1);
}
uint32_t act = static_cast<uint32_t>(std::stoull(parts[0]));
uint32_t parent = static_cast<uint32_t>(std::stoull(parts[2]));
uint32_t act = std::stoul(parts[0]);
uint32_t parent = std::stoul(parts[2]);
full_parents_list[act] = parent;
node_parent[act] = parent;
}
Expand Down Expand Up @@ -118,7 +208,7 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi
}
}
if (num_taxid_failed) {
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} evaluations.",
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} total evaluations.",
tax_tree_filepath, num_taxid_failed, relevant_taxids.size());
}

Expand Down Expand Up @@ -181,24 +271,24 @@ void TaxonomyClsAnno::rmq_preprocessing(const std::vector<TaxId> &tree_lineariza

std::vector<TaxId> TaxonomyClsAnno::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const {
cerr << "Assign class not implemented reversed = " << reversed << "\n";
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size" + to_string(sequence.size()));
exit(0);
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size"
+ to_string(sequence.size()));
}

std::vector<TaxId> TaxonomyClsImportDB::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const {
cerr << "Assign class not implemented reversed = " << reversed << "\n";
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size" + to_string(sequence.size()));
exit(0);
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size"
+ to_string(sequence.size()));
}

TaxId TaxonomyClsAnno::find_lca(const std::vector<TaxId> &taxids) const {
throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size" + to_string(taxids.size()));
exit(0);
throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size"
+ to_string(taxids.size()));
}

TaxId TaxonomyClsImportDB::find_lca(const std::vector<TaxId> &taxids) const {
throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size" + to_string(taxids.size()));
exit(0);
throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size"
+ to_string(taxids.size()));
}

} // namespace annot
Expand Down
25 changes: 9 additions & 16 deletions metagraph/src/annotation/taxonomy/tax_classifier.hpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
#ifndef __TAX_CLASSIFIER_HPP__
#define __TAX_CLASSIFIER_HPP__

#ifdef TESTING
#define PRIVATE_TESTABLE public
#define PROTECTED_TESTABLE public
#else
#define PRIVATE_TESTABLE private
#define PROTECTED_TESTABLE protected
#endif

#include <tsl/hopscotch_set.h>
#include <tsl/hopscotch_map.h>

Expand All @@ -21,7 +13,7 @@ using TaxId = std::uint32_t;
using ChildrenList = tsl::hopscotch_map<TaxId, std::vector<TaxId>>;

class TaxonomyBase {
public:
public:
using KmerId = annot::MultiLabelEncoded<std::string>::Index;
using node_index = graph::SequenceGraph::node_index;

Expand All @@ -32,14 +24,15 @@ class TaxonomyBase {
};

TaxonomyBase() {};
TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate) :
_lca_coverage_rate(lca_coverage_rate), _kmers_discovery_rate(kmers_discovery_rate) {};
TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate)
: _lca_coverage_rate(lca_coverage_rate),
_kmers_discovery_rate(kmers_discovery_rate) {};

virtual ~TaxonomyBase() {};

TaxId assign_class(const std::string &sequence) const;

PROTECTED_TESTABLE:
protected:
void assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map);

virtual TaxId find_lca(const std::vector<TaxId> &taxids) const = 0;
Expand Down Expand Up @@ -102,19 +95,19 @@ class TaxonomyBase {
};

class TaxonomyClsImportDB : public TaxonomyBase {
public:
public:
// todo implement
TaxonomyClsImportDB(const std::string &taxdb_filepath,
const double lca_coverage_rate,
const double kmers_discovery_rate);

PRIVATE_TESTABLE:
private:
std::vector<TaxId> get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const;
TaxId find_lca(const std::vector<TaxId> &taxids) const;
};

class TaxonomyClsAnno : public TaxonomyBase {
public:
public:
/**
* TaxonomyCls constructor
*
Expand All @@ -136,7 +129,7 @@ class TaxonomyClsAnno : public TaxonomyBase {

TaxId assign_class_toplabels(const std::string &sequence, const double label_fraction) const;

PRIVATE_TESTABLE:
private:
/**
* Reads and returns the taxonomic tree as a list of children.
*
Expand Down
Loading

0 comments on commit ed55fc4

Please sign in to comment.