From 8567bff935921c37dc038f186c9036ace5a41096 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Fri, 4 Sep 2020 09:43:12 +0200
Subject: [PATCH 01/51] Intermediate

---
 metagraph/src/cli/build.cpp                       |  15 +-
 metagraph/src/common/elias_fano.cpp               |  14 +-
 metagraph/src/common/elias_fano.hpp               |   2 +-
 .../sorted_sets/sorted_set_disk_base.cpp          |   2 +-
 .../succinct/boss_chunk_construct.cpp             | 353 ++++++++++++------
 .../succinct/boss_chunk_construct.hpp             |   6 +-
 6 files changed, 258 insertions(+), 134 deletions(-)

diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 9e49828e42..2b131f40ac 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -11,6 +11,7 @@
 #include "graph/representation/bitmap/dbg_bitmap_construct.hpp"
 #include "graph/representation/succinct/dbg_succinct.hpp"
 #include "graph/representation/succinct/boss_construct.hpp"
+#include "graph/representation/succinct/build_checkpoint.hpp"
 #include "graph/graph_extensions/node_weights.hpp"
 #include "config/config.hpp"
 #include "parse_sequences.hpp"
@@ -105,6 +106,8 @@ int build_graph(Config *config) {
             logger->info("k-mer suffix: '{}'", suffix);
         }
 
+        boss::BuildCheckpoint checkpoint(config->tmp_dir);
+
         auto constructor = boss::IBOSSChunkConstructor::initialize(
             boss_graph->get_k(),
             config->canonical,
@@ -115,10 +118,18 @@ int build_graph(Config *config) {
             config->tmp_dir.empty() ? kmer::ContainerType::VECTOR
                                     : kmer::ContainerType::VECTOR_DISK,
             config->tmp_dir,
-            config->disk_cap_bytes
+            config->disk_cap_bytes,
+            checkpoint
         );
 
-        push_sequences(files, *config, timer, constructor.get());
+        if (checkpoint.continuation_phase() == 0) {
+            push_sequences(files, *config, timer, constructor.get());
+            checkpoint.set_phase(1);
+            checkpoint.set_kmer_dir(constructor->tmp_dir());
+            checkpoint.store();
+        } else {
+            logger->info("Skipping parsing sequences from input file(s)");
+        }
 
         boss::BOSS::Chunk *next_chunk = constructor->build_chunk();
         logger->trace("Graph chunk with {} k-mers was built in {} sec",
diff --git a/metagraph/src/common/elias_fano.cpp b/metagraph/src/common/elias_fano.cpp
index 36213968fd..3fc1f46100 100644
--- a/metagraph/src/common/elias_fano.cpp
+++ b/metagraph/src/common/elias_fano.cpp
@@ -14,9 +14,10 @@
 namespace mtg {
 namespace common {
 
-void concat(const std::vector<std::string> &files, const std::string &result) {
+std::vector<std::string> concat(const std::vector<std::string> &files,
+                                const std::string &result) {
     if (files.empty())
-        return;
+        return {};
+
+    std::vector<std::string> original_files;
 
     std::vector<std::string> suffixes = { "", ".up" };
     if (std::filesystem::exists(files[0] + ".count"))
@@ -24,17 +25,16 @@
     for (const std::string &suffix : suffixes) {
         std::string concat_command = "cat ";
-        for (uint32_t i = 1; i < files.size(); ++i) {
+        for (uint32_t i = 0; i < files.size(); ++i) {
             concat_command += files[i] + suffix + " ";
         }
-        concat_command += " >> " + files[0] + suffix;
-
+        concat_command += " > " + result + suffix;
+        logger->trace("Executing '{}'", concat_command);
         if (std::system(concat_command.c_str()))
             throw std::runtime_error("Error while cat-ing files: " + concat_command);
 
-        std::filesystem::rename(files[0] + suffix, result + suffix);
         for (const std::string &f : files) {
-            std::filesystem::remove(f + suffix);
+            original_files.push_back(f + suffix);
         }
     }
+
+    return original_files;
 }
diff --git a/metagraph/src/common/elias_fano.hpp b/metagraph/src/common/elias_fano.hpp
index 8b35d2013c..cdee2bf45d 100644
--- a/metagraph/src/common/elias_fano.hpp
+++ b/metagraph/src/common/elias_fano.hpp
@@ -21,7 +21,7 @@ namespace common {
  * The files store data that is ordered and the values in a file are smaller than the
  * values in the next file.
  */
-void concat(const std::vector<std::string> &files, const std::string &result);
+std::vector<std::string> concat(const std::vector<std::string> &files,
+                                const std::string &result);
 
 /**
  * Elias-Fano encoder that streams the encoded result into a file.
diff --git a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
index 685ee2a5e7..49c8bc7092 100644
--- a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
+++ b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
@@ -91,7 +91,7 @@ void SortedSetDiskBase<T>::start_merging_async() {
     async_worker_.enqueue([file_names, this]() {
         std::function<void(const T &)> on_new_item
                 = [this](const T &v) { merge_queue_.push(v); };
-        merge_files(file_names, on_new_item);
+        merge_files(file_names, on_new_item, false);
         merge_queue_.shutdown();
     });
 }
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index aaf8196319..38f3c50d71 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -243,7 +243,8 @@
 template <typename T>
 void recover_dummy_nodes(const KmerCollector &kmer_collector,
                          Vector<T> &kmers,
                          ChunkedWaitQueue<T> *kmers_out,
-                         ThreadPool &async_worker) {
+                         ThreadPool &async_worker,
+                         const BuildCheckpoint &) {
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $
     using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
@@ -398,7 +399,8 @@
 std::tuple<std::vector<std::string>, std::vector<std::string>, std::string>
 generate_dummy_1_kmers(size_t k,
                        size_t num_threads,
                        const std::filesystem::path &dir,
-                       ChunkedWaitQueue<T_REAL> &kmers) {
+                       ChunkedWaitQueue<T_REAL> &kmers,
+                       BuildCheckpoint *checkpoint) {
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerExtractorBOSS::KmerBOSS
     using KMER_INT = typename KMER::WordType; // KmerExtractorBOSS::KmerBOSS::WordType
 
     // for a DNA alphabet, this will contain 16 chunks, split by kmer[0] and kmer[1]
     std::vector<std::string> real_F_W = split(k, dir, kmers);
+    checkpoint->set_phase(3);
+    checkpoint->store();
 
     const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
 
     std::vector<std::string> dummy_l1_names(alphabet_size);
     std::vector<std::string> dummy_sink_names(alphabet_size);
     std::vector<common::EliasFanoEncoderBuffered<KMER_INT>> dummy_l1_chunks;
     std::vector<common::EliasFanoEncoderBuffered<KMER_INT>> dummy_sink_chunks;
     for (TAlphabet i = 0; i < alphabet_size; ++i) {
         dummy_l1_names[i] = dir/("dummy_source_1_" + std::to_string(i));
         dummy_sink_names[i] = dir/("dummy_sink_" + std::to_string(i));
         dummy_l1_chunks.emplace_back(dummy_l1_names[i], ENCODER_BUFFER_SIZE);
         dummy_sink_chunks.emplace_back(dummy_sink_names[i], ENCODER_BUFFER_SIZE);
     }
 
@@ -421,97 +425,115 @@ generate_dummy_1_kmers(size_t k,
-    logger->trace("Generating dummy-1 source k-mers and dummy sink k-mers...");
-    uint64_t num_sink = 0;
-    uint64_t num_source = 0;
-
-    static constexpr size_t L = KMER::kBitsPerChar;
-    KMER_INT kmer_delta = kmer::get_sentinel_delta<KMER_INT>(L, k + 1);
-    // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
-    kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
-
-    #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
-    for (TAlphabet F = 0; F < alphabet_size; ++F) {
-
-        // stream k-mers of pattern ***F*
-        std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
-                                          real_F_W.begin() + (F + 1) * alphabet_size);
-        common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
-
-        std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
-        for (TAlphabet c = 0; c < alphabet_size; ++c) {
-            W_chunks.push_back(real_F_W[c * alphabet_size + F]);
-        }
-        common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
-
-        while (!it.empty()) {
-            KMER_REAL dummy_source(it.pop());
-            // skip k-mers that would generate identical source dummy k-mers
-            skip_same_suffix(dummy_source, it, 0);
-            dummy_source.to_prev(k + 1, 0);
-            // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
-            while (!sink_gen_it.empty()
-                   && sink_gen_it.top() < dummy_source.data()) {
-                KMER_REAL v(sink_gen_it.pop());
-                // skip k-mers with the same suffix as v, as they generate identical dummy
-                // sink k-mers
-                skip_same_suffix(v, sink_gen_it, 1);
-                dummy_sink_chunks[F].add(kmer::get_sink_and_lift<KMER>(v, k + 1));
-                num_sink++;
-            }
-            if (!sink_gen_it.empty()) {
-                KMER_REAL top(sink_gen_it.top());
-                if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
-                    // The source dummy k-mer #dummy_source generated from #it is
-                    // redundant iff it shares its suffix with another real k-mer (#top).
-                    // In this case, #top generates a dummy sink k-mer redundant with #it.
-                    // So if #dummy_source is redundant, the sink generated from #top is
-                    // also redundant - so it's being skipped
-                    skip_same_suffix(top, sink_gen_it, 1);
-                    continue;
-                }
-            }
-            // lift all and reset the first character to the sentinel 0 (apply mask)
-            dummy_l1_chunks[F].add(kmer::transform<KMER>(dummy_source, k + 1) + kmer_delta);
-            num_source++;
-        }
-        // handle leftover sink_gen_it
-        while (!sink_gen_it.empty()) {
-            KMER_REAL v(sink_gen_it.pop());
-            skip_same_suffix(v, sink_gen_it, 1);
-            dummy_sink_chunks[F].add(kmer::get_sink_and_lift<KMER>(v, k + 1));
-            num_sink++;
-        }
-    }
-
-    for (TAlphabet i = 0; i < alphabet_size; ++i) {
-        dummy_sink_chunks[i].finish();
-        dummy_l1_chunks[i].finish();
-    }
-
-    logger->trace("Generated {} dummy sink and {} dummy source k-mers",
-                  num_sink, num_source);
-
-    // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
-    // concatenating the blocks will result in a single ordered block
-    logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
-                  dummy_sink_names.size());
-    std::string dummy_sink_name = dir/"dummy_sink";
-    common::concat(dummy_sink_names, dummy_sink_name);
-
-    // similarly, the 16 blocks of the original k-mers can be concatenated in groups of
-    // 4 without destroying the order
-    logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
-                  real_F_W.size(), alphabet_size);
-    std::vector<std::string> real_split_by_W;
-    for (TAlphabet W = 0; W < alphabet_size; ++W) {
-        std::vector<std::string> blocks;
-        for (TAlphabet F = 0; F < alphabet_size; ++F) {
-            blocks.push_back(real_F_W[F * alphabet_size + W]);
-        }
-        real_split_by_W.push_back(dir/("real_split_by_W_" + std::to_string(W)));
-        common::concat(blocks, real_split_by_W.back());
-    }
+    if (checkpoint->continuation_phase() < 3) {
+        logger->info("Generating dummy-1 source k-mers and dummy sink k-mers...");
+        uint64_t num_sink = 0;
+        uint64_t num_source = 0;
+
+        static constexpr size_t L = KMER::kBitsPerChar;
+        KMER_INT kmer_delta = kmer::get_sentinel_delta<KMER_INT>(L, k + 1);
+        // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
+        kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
+
+        #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
+        for (TAlphabet F = 0; F < alphabet_size; ++F) {
+            // stream k-mers of pattern ***F*
+            std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
+                                              real_F_W.begin() + (F + 1) * alphabet_size);
+            common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
+
+            std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
+            for (TAlphabet c = 0; c < alphabet_size; ++c) {
+                W_chunks.push_back(real_F_W[c * alphabet_size + F]);
+            }
+            common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
+
+            while (!it.empty()) {
+                KMER_REAL dummy_source(it.pop());
+                // skip k-mers that would generate identical source dummy k-mers
+                skip_same_suffix(dummy_source, it, 0);
+                dummy_source.to_prev(k + 1, 0);
+                // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
+                while (!sink_gen_it.empty() && sink_gen_it.top() < dummy_source.data()) {
+                    KMER_REAL v(sink_gen_it.pop());
+                    // skip k-mers with the same suffix as v, as they generate identical
+                    // dummy sink k-mers
+                    skip_same_suffix(v, sink_gen_it, 1);
+                    dummy_sink_chunks[F].add(kmer::get_sink_and_lift<KMER>(v, k + 1));
+                    num_sink++;
+                }
+                if (!sink_gen_it.empty()) {
+                    KMER_REAL top(sink_gen_it.top());
+                    if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
+                        // The source dummy k-mer #dummy_source generated from #it is
+                        // redundant iff it shares its suffix with another real k-mer (#top).
+                        // In this case, #top generates a dummy sink k-mer redundant with
+                        // #it. So if #dummy_source is redundant, the sink generated from
+                        // #top is also redundant - so it's being skipped
+                        skip_same_suffix(top, sink_gen_it, 1);
+                        continue;
+                    }
+                }
+                // lift all and reset the first character to the sentinel 0 (apply mask)
+                dummy_l1_chunks[F].add(kmer::transform<KMER>(dummy_source, k + 1)
+                        + kmer_delta);
+                num_source++;
+            }
+            // handle leftover sink_gen_it
+            while (!sink_gen_it.empty()) {
+                KMER_REAL v(sink_gen_it.pop());
+                skip_same_suffix(v, sink_gen_it, 1);
+                dummy_sink_chunks[F].add(kmer::get_sink_and_lift<KMER>(v, k + 1));
+                num_sink++;
+            }
+        }
+
+        for (TAlphabet i = 0; i < alphabet_size; ++i) {
+            dummy_sink_chunks[i].finish();
+            dummy_l1_chunks[i].finish();
+        }
+
+        logger->trace("Generated {} dummy sink and {} dummy source k-mers", num_sink,
+                      num_source);
+        checkpoint->set_phase(3);
+        checkpoint->store();
+    } else {
+        logger->info("Skipping generating dummy-1 source k-mers and dummy sink k-mers");
+    }
+
+    std::vector<std::string> real_split_by_W(alphabet_size);
+    std::string dummy_sink_name = dir/"dummy_sink";
+    for (TAlphabet W = 0; W < alphabet_size; ++W) {
+        real_split_by_W[W] = dir/("real_split_by_W_" + std::to_string(W));
+    }
+    if (checkpoint->continuation_phase() < 4) {
+        // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
+        // concatenating the blocks will result in a single ordered block
+        logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
+                      dummy_sink_names.size());
+        std::vector<std::string> to_delete
+                = common::concat(dummy_sink_names, dummy_sink_name);
+
+        // similarly, the 16 blocks of the original k-mers can be concatenated in
+        // groups of 4 without destroying the order
+        logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
+                      real_F_W.size(), alphabet_size);
+        for (TAlphabet W = 0; W < alphabet_size; ++W) {
+            std::vector<std::string> blocks;
+            for (TAlphabet F = 0; F < alphabet_size; ++F) {
+                blocks.push_back(real_F_W[F * alphabet_size + W]);
+            }
+            std::vector<std::string> original
+                    = common::concat(blocks, real_split_by_W[W]);
+            to_delete.insert(to_delete.end(), original.begin(), original.end());
+        }
+        for (const auto &name : to_delete) {
+            std::filesystem::remove(name);
+        }
+        checkpoint->set_phase(4);
+        checkpoint->store();
+    }
 
     return { real_split_by_W, dummy_l1_names, dummy_sink_name };
 }
 
@@ -552,47 +574,93 @@
 void add_reverse_complements(size_t k,
                              size_t num_threads,
                              size_t buffer_size,
                              const std::filesystem::path &dir,
                              ThreadPool &async_worker,
-                             ChunkedWaitQueue<T_REAL> *kmers) {
+                             ChunkedWaitQueue<T_REAL> *kmers,
+                             BuildCheckpoint *checkpoint) {
     using T_INT_REAL = get_int_t<T_REAL>; // either KMER_INT or std::pair<KMER_INT, count>
 
-    std::string rc_dir = dir/"rc";
-    std::filesystem::create_directory(rc_dir);
-    auto rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
-            num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
-
-    logger->trace("Adding reverse complements...");
-    common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original", ENCODER_BUFFER_SIZE);
-    Vector<T_INT_REAL> buffer;
-    buffer.reserve(10'000);
-    for (auto &it = kmers->begin(); it != kmers->end(); ++it) {
-        const T_REAL &kmer = *it;
-        const T_REAL &reverse = rev_comp(k + 1, *it, KmerExtractor2Bit().complement_code());
-        if (get_first(kmer) != get_first(reverse)) {
-            buffer.push_back(reinterpret_cast<const T_INT_REAL &>(reverse));
-            if (buffer.size() == buffer.capacity()) {
-                rc_set->insert(buffer.begin(), buffer.end());
-                buffer.resize(0);
-            }
-            original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
-        } else {
-            if constexpr (utils::is_pair_v<T_REAL>) {
-                using C = typename T_REAL::second_type;
-                if (kmer.second >> (sizeof(C) * 8 - 1)) {
-                    original.add({ kmer.first.data(), std::numeric_limits<C>::max() });
-                } else {
-                    original.add({ kmer.first.data(), 2 * kmer.second });
-                }
-            } else {
-                original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
-            }
-        }
-    }
-    rc_set->insert(buffer.begin(), buffer.end());
-    original.finish();
+    std::filesystem::path rc_dir = dir/"rc";
+    std::filesystem::create_directory(rc_dir);
+    // constructed lazily: on a restarted build the checkpointed files in #rc_dir and
+    // #dir must not be touched before the recovery code below has inspected them
+    std::unique_ptr<common::SortedSetDisk<T_INT_REAL>> rc_set;
+
+    Vector<T_INT_REAL> buffer;
+    buffer.reserve(10'000);
+
+    if (checkpoint->continuation_phase() < 2) {
+        logger->info("Adding reverse complements...");
+        rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
+                num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
+        common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original",
+                                                              ENCODER_BUFFER_SIZE);
+        for (auto &it = kmers->begin(); it != kmers->end(); ++it) {
+            const T_REAL &kmer = *it;
+            const T_REAL &reverse
+                    = rev_comp(k + 1, *it, KmerExtractor2Bit().complement_code());
+            if (get_first(kmer) != get_first(reverse)) {
+                buffer.push_back(reinterpret_cast<const T_INT_REAL &>(reverse));
+                if (buffer.size() == buffer.capacity()) {
+                    rc_set->insert(buffer.begin(), buffer.end());
+                    buffer.resize(0);
+                }
+                original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
+            } else {
+                if constexpr (utils::is_pair_v<T_REAL>) {
+                    using C = typename T_REAL::second_type;
+                    if (kmer.second >> (sizeof(C) * 8 - 1)) {
+                        original.add({ kmer.first.data(), std::numeric_limits<C>::max() });
+                    } else {
+                        original.add({ kmer.first.data(), 2 * kmer.second });
+                    }
+                } else {
+                    original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
+                }
+            }
+        }
+        rc_set->insert(buffer.begin(), buffer.end());
+        buffer.resize(0);
+        original.finish();
+        checkpoint->set_phase(2);
+        checkpoint->store();
+    } else {
+        logger->info("Skipping adding reverse complements");
+    }
+
+    if (checkpoint->continuation_phase() == 2) {
+        logger->info("Continuing from checkpoint phase 2. Looking for 'original' and "
+                     "'rc/chunk_*' in {}",
+                     checkpoint->kmer_dir());
+        if (!std::filesystem::exists(checkpoint->kmer_dir()/"original")) {
+            logger->error("Could not find {}. Recovery not possible. Remove {} to "
+                          "restart the computation.",
+                          checkpoint->kmer_dir()/"original", checkpoint->kmer_dir());
+            std::exit(1);
+        }
+        std::vector<std::string> file_names;
+        for (const auto &path
+                : std::filesystem::directory_iterator(checkpoint->kmer_dir()/"rc")) {
+            if (path.is_regular_file()
+                    && path.path().filename().string().find("chunk_", 0) == 0
+                    && path.path().filename().extension() == "") {
+                logger->trace("Found chunk: {}", path.path().string());
+                file_names.push_back(path.path().string());
+            }
+        }
+        if (file_names.empty()) {
+            logger->error("Could not find chunk_* files in {}. Recovery not possible. "
" + "Remove temp dir to restart the computation from scratch.", + checkpoint->kmer_dir()); + std::exit(1); + } + rc_set.reset(); + async_worker.enqueue([kmers, file_names = std::move(file_names)]() { + std::function on_new_item + = [&kmers, &file_names](const T_INT_REAL &v) { rc_set.push(v); }; + merge_files(file_names, on_new_item, false); + rc_set.shutdown(); + }); + } + // start merging #original with #reverse_complements into #kmers kmers->reset(); async_worker.enqueue([rc_set = std::move(rc_set), &dir, kmers]() { ChunkedWaitQueue &reverse_complements = rc_set->data(true); - common::EliasFanoDecoder original_kmers(dir / "original"); + common::EliasFanoDecoder original_kmers(dir/"original"); merge(original_kmers, reverse_complements, kmers); }); } @@ -612,7 +680,8 @@ template void recover_dummy_nodes(const KmerCollector &kmer_collector, ChunkedWaitQueue &kmers, ChunkedWaitQueue *kmers_out, - ThreadPool &async_worker) { + ThreadPool &async_worker, + BuildCheckpoint* checkpoint) { using KMER_REAL = get_first_type_t; // 64/128/256-bit KmerBOSS using T_INT_REAL = get_int_t; // either KMER_INT or @@ -620,27 +689,57 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector, using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer size_t k = kmer_collector.get_k() - 1; - const std::filesystem::path dir = kmer_collector.tmp_dir(); + const std::filesystem::path dir = checkpoint->continuation_phase() == 0 + ? kmer_collector.tmp_dir() + : checkpoint->kmer_dir(); size_t num_threads = kmer_collector.num_threads(); + if (checkpoint->continuation_phase() == 1) { + logger->info( + "Continuing from checkpoint phase 1. Looking for chunk_* files in {}", + checkpoint->kmer_dir()); + std::vector file_names; + for (const auto &path : std::filesystem::directory_iterator(checkpoint->kmer_dir())) { + if (path.is_regular_file() + && path.path().filename().string().find("chunk_", 0) == 0 + && path.path().filename().extension() == "") { + logger->trace("Found chunk: {}", path.path().string()); + file_names.push_back(path.path().string()); + } + } + if (file_names.empty()) { + logger->error( + "Could not find chunk_* files in {}. Recovery not possible. 
" + "Remove temp dir to restart the computation from scratch.", + checkpoint->kmer_dir()); + std::exit(1); + } + kmers.reset(); + async_worker.enqueue([kmers, file_names = std::move(file_names)]() { + std::function on_new_item + = [&kmers, &file_names](const T &v) { kmers.push(v); }; + merge_files(file_names, on_new_item, false); + kmers.shutdown(); + }); + } + if (kmer_collector.is_both_strands_mode()) { // compute the reverse complements of #kmers, then merge back into #kmers add_reverse_complements(k, num_threads, kmer_collector.buffer_size(), dir, - async_worker, &kmers); + async_worker, &kmers, checkpoint); } std::string dummy_sink_name; std::vector real_split_by_W; std::vector dummy_names; std::tie(real_split_by_W, dummy_names, dummy_sink_name) - = generate_dummy_1_kmers(k, num_threads, dir, kmers); + = generate_dummy_1_kmers(k, num_threads, dir, kmers, checkpoint); - // stores the sorted original kmers and dummy-1 k-mers + // file names for the dummy_sink_0..3 and dummy_source_0..k_0..3 kmers std::vector dummy_chunks = { dummy_sink_name }; // generate dummy k-mers of prefix length 1..k logger->trace("Starting generating dummy-1..k source k-mers..."); for (size_t dummy_pref_len = 1; dummy_pref_len <= k; ++dummy_pref_len) { - // this will compress all sorted dummy k-mers of given prefix length for (const std::string &f : dummy_names) { dummy_chunks.push_back(f); } @@ -677,8 +776,11 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector, const std::function on_merge = [](const KMER_INT &) {}; common::merge_files(dummy_names, on_merge); - // at this point, we have the original k-mers and dummy-1 k-mers in original_and_dummy_l1, - // the dummy-x k-mers in dummy_source_{x}, and we merge them all into a single stream + checkpoint->set_phase(6); + checkpoint->store(); + + // at this point, we have the original k-mers in real_split_by_W, the dummy-x k-mers + // in dummy_chunks, and we merge them all into a single stream kmers_out->reset(); // add the main dummy source k-mer @@ -703,7 +805,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector, return kmer::transform(reinterpret_cast(v), k + 1) + kmer_delta; } }, - real_split_by_W, true + real_split_by_W, false /* remove sources */ ); common::Transformed, T> decoder_dummy( @@ -714,7 +816,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector, return reinterpret_cast(v); } }, - dummy_chunks, true + dummy_chunks, false /* remove sources */ ); while (!decoder.empty() && !decoder_dummy.empty()) { @@ -770,7 +872,8 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor { size_t num_threads, double memory_preallocated, const std::filesystem::path &tmp_dir, - size_t max_disk_space) + size_t max_disk_space, + const BuildCheckpoint& checkpoint) : kmer_collector_(k + 1, both_strands_mode, encode_filter_suffix_boss(filter_suffix), @@ -779,26 +882,30 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor { tmp_dir, max_disk_space, both_strands_mode && filter_suffix.empty() /* keep only canonical k-mers */), - bits_per_count_(bits_per_count) { - if (filter_suffix.size() + bits_per_count_(bits_per_count), checkpoint_(checkpoint) { + if (checkpoint.phase() == 0 && filter_suffix.size() && filter_suffix == std::string(filter_suffix.size(), BOSS::kSentinel)) { kmer_collector_.add_kmer(std::vector(k + 1, BOSS::kSentinelCode)); } } - void add_sequence(std::string_view sequence, uint64_t count) { + void add_sequence(std::string_view sequence, uint64_t count) override { kmer_collector_.add_sequence(sequence, count); } - 
+    void add_sequences(std::vector<std::string>&& sequences) override {
         kmer_collector_.add_sequences(std::move(sequences));
     }
 
-    void add_sequences(std::vector<std::pair<std::string, uint64_t>>&& sequences) {
+    void add_sequences(std::vector<std::pair<std::string, uint64_t>>&& sequences) override {
         kmer_collector_.add_sequences(std::move(sequences));
     }
 
-    BOSS::Chunk* build_chunk() {
+    std::filesystem::path tmp_dir() const override {
+        return kmer_collector_.tmp_dir();
+    }
+
+    BOSS::Chunk* build_chunk() override {
         BOSS::Chunk *result;
 
         typename KmerCollector::Data &kmer_ints = kmer_collector_.data();
@@ -849,13 +956,14 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
         return result;
     }
 
-    uint64_t get_k() const { return kmer_collector_.get_k() - 1; }
+    uint64_t get_k() const override { return kmer_collector_.get_k() - 1; }
 
   private:
     KmerCollector kmer_collector_;
     uint8_t bits_per_count_;
     /** Used as an async executor for merging chunks from disk */
     ThreadPool async_worker_ = ThreadPool(1, 1);
+    BuildCheckpoint checkpoint_;
 };
 
 template
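
Note on boss::BuildCheckpoint: the patch includes and calls
"graph/representation/succinct/build_checkpoint.hpp" but does not add that file in
this diff. The following is a minimal sketch of the interface the call sites above
imply (phase()/set_phase(), continuation_phase(), kmer_dir()/set_kmer_dir(),
store(), construction from the tmp dir). It is an inferred illustration, not the
committed implementation; the member names, the on-disk format, and the phase
numbering are assumptions.

// build_checkpoint.hpp -- hypothetical sketch, inferred from the call sites
#pragma once

#include <cstdint>
#include <filesystem>
#include <fstream>

namespace mtg {
namespace graph {
namespace boss {

class BuildCheckpoint {
  public:
    // load the phase stored by an interrupted run from #tmp_dir, if present
    explicit BuildCheckpoint(const std::filesystem::path &tmp_dir)
          : checkpoint_file_(tmp_dir/"build_checkpoint") {
        std::ifstream in(checkpoint_file_);
        std::string kmer_dir;
        if (in >> continuation_phase_ >> kmer_dir) {
            phase_ = continuation_phase_;
            kmer_dir_ = kmer_dir;
        }
    }

    // last phase completed by the current run
    uint32_t phase() const { return phase_; }
    void set_phase(uint32_t phase) { phase_ = phase; }

    // phase loaded from disk at startup; 0 means "start from scratch"
    uint32_t continuation_phase() const { return continuation_phase_; }

    // directory holding the k-mer chunks of the interrupted run
    const std::filesystem::path& kmer_dir() const { return kmer_dir_; }
    void set_kmer_dir(const std::filesystem::path &dir) { kmer_dir_ = dir; }

    // persist the current phase so that a restarted build can skip completed work
    void store() const {
        std::ofstream out(checkpoint_file_);
        out << phase_ << '\n' << kmer_dir_.string() << '\n';
    }

  private:
    std::filesystem::path checkpoint_file_;
    uint32_t phase_ = 0;
    uint32_t continuation_phase_ = 0;
    std::filesystem::path kmer_dir_;
};

} // namespace boss
} // namespace graph
} // namespace mtg

Under this reading, each completed stage calls set_phase(n) followed by store(),
while continuation_phase() stays frozen at its startup value -- which is why the
guards above compare against continuation_phase() rather than phase().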