From 8567bff935921c37dc038f186c9036ace5a41096 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Fri, 4 Sep 2020 09:43:12 +0200
Subject: [PATCH 01/51] Intermediate
---
metagraph/src/cli/build.cpp | 15 +-
metagraph/src/common/elias_fano.cpp | 14 +-
metagraph/src/common/elias_fano.hpp | 2 +-
.../sorted_sets/sorted_set_disk_base.cpp | 2 +-
.../succinct/boss_chunk_construct.cpp | 353 ++++++++++++------
.../succinct/boss_chunk_construct.hpp | 6 +-
6 files changed, 258 insertions(+), 134 deletions(-)
diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 9e49828e42..2b131f40ac 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -11,6 +11,7 @@
#include "graph/representation/bitmap/dbg_bitmap_construct.hpp"
#include "graph/representation/succinct/dbg_succinct.hpp"
#include "graph/representation/succinct/boss_construct.hpp"
+#include "graph/representation/succinct/build_checkpoint.hpp"
#include "graph/graph_extensions/node_weights.hpp"
#include "config/config.hpp"
#include "parse_sequences.hpp"
@@ -105,6 +106,8 @@ int build_graph(Config *config) {
logger->info("k-mer suffix: '{}'", suffix);
}
+ boss::BuildCheckpoint checkpoint(config->tmp_dir);
+
auto constructor = boss::IBOSSChunkConstructor::initialize(
boss_graph->get_k(),
config->canonical,
@@ -115,10 +118,18 @@ int build_graph(Config *config) {
config->tmp_dir.empty() ? kmer::ContainerType::VECTOR
: kmer::ContainerType::VECTOR_DISK,
config->tmp_dir,
- config->disk_cap_bytes
+ config->disk_cap_bytes,
+ checkpoint
);
- push_sequences(files, *config, timer, constructor.get());
+ if (checkpoint.continuation_phase() == 0) {
+ push_sequences(files, *config, timer, constructor.get());
+ checkpoint.set_phase(1);
+ checkpoint.set_kmer_dir(constructor->tmp_dir());
+ checkpoint.store();
+ } else {
+ logger->info("Skipping parsing sequences from input file(s)");
+ }
boss::BOSS::Chunk *next_chunk = constructor->build_chunk();
logger->trace("Graph chunk with {} k-mers was built in {} sec",
diff --git a/metagraph/src/common/elias_fano.cpp b/metagraph/src/common/elias_fano.cpp
index 36213968fd..3fc1f46100 100644
--- a/metagraph/src/common/elias_fano.cpp
+++ b/metagraph/src/common/elias_fano.cpp
@@ -14,9 +14,10 @@
namespace mtg {
namespace common {
-void concat(const std::vector<std::string> &files, const std::string &result) {
+std::vector<std::string> concat(const std::vector<std::string> &files, const std::string &result) {
     if (files.empty())
-        return;
+        return {};
+    std::vector<std::string> original_files;
     std::vector<std::string> suffixes = { "", ".up" };
if (std::filesystem::exists(files[0] + ".count"))
@@ -24,17 +25,16 @@ void concat(const std::vector &files, const std::string &result) {
for (const std::string &suffix : suffixes) {
std::string concat_command = "cat ";
- for (uint32_t i = 1; i < files.size(); ++i) {
+ for (uint32_t i = 0; i < files.size(); ++i) {
concat_command += files[i] + suffix + " ";
}
- concat_command += " >> " + files[0] + suffix;
-
+ concat_command += " > " + result + suffix;
+ logger->trace("Executing '{}'", concat_command);
if (std::system(concat_command.c_str()))
throw std::runtime_error("Error while cat-ing files: " + concat_command);
- std::filesystem::rename(files[0] + suffix, result + suffix);
for (const std::string &f : files) {
- std::filesystem::remove(f + suffix);
+ original_files.push_back(f + suffix);
}
}
}
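
With this change, concat() writes the concatenation of #files to #result and returns the names of the source files instead of deleting them, leaving deletion to the caller. A minimal usage sketch (paths are illustrative):

    // The caller now owns deletion of the concatenated sources, so the
    // sources survive on disk until a checkpoint confirms the result.
    std::vector<std::string> chunks = { "/tmp/chunk_0", "/tmp/chunk_1" };
    std::vector<std::string> to_delete = common::concat(chunks, "/tmp/merged");
    // ... store the checkpoint, then:
    for (const std::string &f : to_delete)
        std::filesystem::remove(f);
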
diff --git a/metagraph/src/common/elias_fano.hpp b/metagraph/src/common/elias_fano.hpp
index 8b35d2013c..cdee2bf45d 100644
--- a/metagraph/src/common/elias_fano.hpp
+++ b/metagraph/src/common/elias_fano.hpp
@@ -21,7 +21,7 @@ namespace common {
* The files store data that is ordered and the values in a file are smaller than the
* values in the next file.
*/
-void concat(const std::vector<std::string> &files, const std::string &result);
+std::vector<std::string> concat(const std::vector<std::string> &files, const std::string &result);
/**
* Elias-Fano encoder that streams the encoded result into a file.
diff --git a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
index 685ee2a5e7..49c8bc7092 100644
--- a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
+++ b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
@@ -91,7 +91,7 @@ void SortedSetDiskBase<T>::start_merging_async() {
     async_worker_.enqueue([file_names, this]() {
         std::function<void(const T &)> on_new_item
                 = [this](const T &v) { merge_queue_.push(v); };
- merge_files(file_names, on_new_item);
+ merge_files(file_names, on_new_item, false);
merge_queue_.shutdown();
});
}
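
The new third argument suggests merge_files() gained a flag controlling whether the merged source files are removed; the declaration is not shown in this series, but judging from the call sites ("false /* remove sources */" below), it is roughly:

    // Assumed signature, inferred from the call sites in this series.
    template <typename T>
    void merge_files(const std::vector<std::string> &files,
                     std::function<void(const T &)> on_new_item,
                     bool remove_sources = true);

Passing false keeps the sorted chunks on disk so a later checkpoint phase can re-merge them after a crash.
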
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index aaf8196319..38f3c50d71 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -243,7 +243,8 @@ template
void recover_dummy_nodes(const KmerCollector &kmer_collector,
                         Vector<T> &kmers,
                         ChunkedWaitQueue<T> *kmers_out,
-                        ThreadPool &async_worker) {
+                        ThreadPool &async_worker,
+                        const BuildCheckpoint& ) {
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $
using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
@@ -398,7 +399,8 @@ std::tuple<std::vector<std::string>, std::vector<std::string>, std::string>
 generate_dummy_1_kmers(size_t k,
                        size_t num_threads,
                        const std::filesystem::path &dir,
-                       ChunkedWaitQueue<T_REAL> &kmers) {
+                       ChunkedWaitQueue<T_REAL> &kmers,
+                       BuildCheckpoint *checkpoint) {
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerExtractorBOSS::KmerBOSS
     using KMER_INT = typename KMER::WordType; // KmerExtractorBOSS::KmerBOSS::WordType
@@ -407,6 +409,8 @@ generate_dummy_1_kmers(size_t k,
     // for a DNA alphabet, this will contain 16 chunks, split by kmer[0] and kmer[1]
     std::vector<std::string> real_F_W = split(k, dir, kmers);
+ checkpoint->set_phase(3);
+ checkpoint->store();
const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
@@ -421,97 +425,115 @@ generate_dummy_1_kmers(size_t k,
dummy_sink_chunks.emplace_back(dummy_sink_names[i], ENCODER_BUFFER_SIZE);
}
- logger->trace("Generating dummy-1 source k-mers and dummy sink k-mers...");
- uint64_t num_sink = 0;
- uint64_t num_source = 0;
+ if (checkpoint->continuation_phase() < 3) {
+ logger->info("Generating dummy-1 source k-mers and dummy sink k-mers...");
+ uint64_t num_sink = 0;
+ uint64_t num_source = 0;
- static constexpr size_t L = KMER::kBitsPerChar;
- KMER_INT kmer_delta = kmer::get_sentinel_delta(L, k + 1);
- // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
- kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
-
- #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
- for (TAlphabet F = 0; F < alphabet_size; ++F) {
-
- // stream k-mers of pattern ***F*
-        std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
-                                          real_F_W.begin() + (F + 1) * alphabet_size);
-        common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
-
-        std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
-        for (TAlphabet c = 0; c < alphabet_size; ++c) {
-            W_chunks.push_back(real_F_W[c * alphabet_size + F]);
-        }
-        common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
-
- while (!it.empty()) {
- KMER_REAL dummy_source(it.pop());
- // skip k-mers that would generate identical source dummy k-mers
- skip_same_suffix(dummy_source, it, 0);
- dummy_source.to_prev(k + 1, 0);
- // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
- while (!sink_gen_it.empty()
- && sink_gen_it.top() < dummy_source.data()) {
+ static constexpr size_t L = KMER::kBitsPerChar;
+ KMER_INT kmer_delta = kmer::get_sentinel_delta(L, k + 1);
+ // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
+ kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
+
+ #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
+ for (TAlphabet F = 0; F < alphabet_size; ++F) {
+ // stream k-mers of pattern ***F*
+            std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
+                                              real_F_W.begin() + (F + 1) * alphabet_size);
+            common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
+
+            std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
+            for (TAlphabet c = 0; c < alphabet_size; ++c) {
+                W_chunks.push_back(real_F_W[c * alphabet_size + F]);
+            }
+            common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
+
+ while (!it.empty()) {
+ KMER_REAL dummy_source(it.pop());
+ // skip k-mers that would generate identical source dummy k-mers
+ skip_same_suffix(dummy_source, it, 0);
+ dummy_source.to_prev(k + 1, 0);
+ // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
+ while (!sink_gen_it.empty() && sink_gen_it.top() < dummy_source.data()) {
+ KMER_REAL v(sink_gen_it.pop());
+ // skip k-mers with the same suffix as v, as they generate identical
+ // dummy sink k-mers
+ skip_same_suffix(v, sink_gen_it, 1);
+ dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
+ num_sink++;
+ }
+ if (!sink_gen_it.empty()) {
+ KMER_REAL top(sink_gen_it.top());
+ if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
+ // The source dummy k-mer #dummy_source generated from #it is
+ // redundant iff it shares its suffix with another real k-mer (#top).
+ // In this case, #top generates a dummy sink k-mer redundant with
+ // #it. So if #dummy_source is redundant, the sink generated from
+ // #top is also redundant - so it's being skipped
+ skip_same_suffix(top, sink_gen_it, 1);
+ continue;
+ }
+ }
+ // lift all and reset the first character to the sentinel 0 (apply mask)
+ dummy_l1_chunks[F].add(kmer::transform(dummy_source, k + 1)
+ + kmer_delta);
+ num_source++;
+ }
+ // handle leftover sink_gen_it
+ while (!sink_gen_it.empty()) {
KMER_REAL v(sink_gen_it.pop());
- // skip k-mers with the same suffix as v, as they generate identical dummy
- // sink k-mers
skip_same_suffix(v, sink_gen_it, 1);
dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
num_sink++;
}
- if (!sink_gen_it.empty()) {
- KMER_REAL top(sink_gen_it.top());
- if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
- // The source dummy k-mer #dummy_source generated from #it is
- // redundant iff it shares its suffix with another real k-mer (#top).
- // In this case, #top generates a dummy sink k-mer redundant with #it.
- // So if #dummy_source is redundant, the sink generated from #top is
- // also redundant - so it's being skipped
- skip_same_suffix(top, sink_gen_it, 1);
- continue;
- }
- }
- // lift all and reset the first character to the sentinel 0 (apply mask)
- dummy_l1_chunks[F].add(kmer::transform(dummy_source, k + 1) + kmer_delta);
- num_source++;
}
- // handle leftover sink_gen_it
- while (!sink_gen_it.empty()) {
- KMER_REAL v(sink_gen_it.pop());
- skip_same_suffix(v, sink_gen_it, 1);
- dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
- num_sink++;
+
+ for (TAlphabet i = 0; i < alphabet_size; ++i) {
+ dummy_sink_chunks[i].finish();
+ dummy_l1_chunks[i].finish();
}
- }
- for (TAlphabet i = 0; i < alphabet_size; ++i) {
- dummy_sink_chunks[i].finish();
- dummy_l1_chunks[i].finish();
+ logger->trace("Generated {} dummy sink and {} dummy source k-mers", num_sink,
+ num_source);
+ checkpoint->set_phase(3);
+ checkpoint->store();
+ } else {
+        logger->info("Skipping generating dummy-1 source k-mers and dummy sink k-mers");
}
- logger->trace("Generated {} dummy sink and {} dummy source k-mers",
- num_sink, num_source);
-
- // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
- // concatenating the blocks will result in a single ordered block
- logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
- dummy_sink_names.size());
- std::string dummy_sink_name = dir/"dummy_sink";
- common::concat(dummy_sink_names, dummy_sink_name);
-
- // similarly, the 16 blocks of the original k-mers can be concatenated in groups of
- // 4 without destroying the order
- logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
- real_F_W.size(), alphabet_size);
-    std::vector<std::string> real_split_by_W;
+    std::vector<std::string> real_split_by_W(alphabet_size);
+    std::string dummy_sink_name = dir / "dummy_sink";
     for (TAlphabet W = 0; W < alphabet_size; ++W) {
-        std::vector<std::string> blocks;
-        for (TAlphabet F = 0; F < alphabet_size; ++F) {
-            blocks.push_back(real_F_W[F * alphabet_size + W]);
+        real_split_by_W[W] = dir/("real_split_by_W_" + std::to_string(W));
+ }
+ if (checkpoint->continuation_phase() < 4) {
+ // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
+ // concatenating the blocks will result in a single ordered block
+ logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
+ dummy_sink_names.size());
+        std::vector<std::string> to_delete
+                = common::concat(dummy_sink_names, dummy_sink_name);
+
+ // similarly, the 16 blocks of the original k-mers can be concatenated in
+ // groups of 4 without destroying the order
+ logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
+ real_F_W.size(), alphabet_size);
+ for (TAlphabet W = 0; W < alphabet_size; ++W) {
+            std::vector<std::string> blocks;
+ for (TAlphabet F = 0; F < alphabet_size; ++F) {
+ blocks.push_back(real_F_W[F * alphabet_size + W]);
+ }
+            std::vector<std::string> original
+                    = common::concat(blocks, real_split_by_W[W]);
+ to_delete.insert(to_delete.end(), original.begin(), original.end());
}
- real_split_by_W.push_back(dir/("real_split_by_W_" + std::to_string(W)));
- common::concat(blocks, real_split_by_W.back());
+ for (const auto &name : to_delete) {
+ std::filesystem::remove(name);
+ }
+ checkpoint->set_phase(4);
+ checkpoint->store();
}
+
return { real_split_by_W, dummy_l1_names, dummy_sink_name };
}
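
The real_F_W chunks are laid out row-major: chunk F * alphabet_size + W holds the real k-mers with first character F (kmer[1]) and last character W (kmer[0]). Per the comments above, within a fixed W the blocks with larger F hold larger k-mers, which is why concatenating over F yields one sorted file per W. A small illustration, assuming DNA (alphabet_size = 4):

    // Illustration only: collect the 4 sorted blocks whose k-mers end in W.
    // Concatenating blocks[0..3] is again sorted, so no merge is needed.
    std::vector<std::string> blocks_for_W(const std::vector<std::string> &real_F_W,
                                          uint8_t alphabet_size, uint8_t W) {
        std::vector<std::string> blocks;
        for (uint8_t F = 0; F < alphabet_size; ++F)
            blocks.push_back(real_F_W[F * alphabet_size + W]); // row-major F, then W
        return blocks;
    }
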
@@ -552,47 +574,93 @@ void add_reverse_complements(size_t k,
size_t buffer_size,
const std::filesystem::path &dir,
ThreadPool& async_worker,
-                             ChunkedWaitQueue<T_REAL> *kmers) {
+                             ChunkedWaitQueue<T_REAL> *kmers,
+                             BuildCheckpoint *checkpoint) {
     using T_INT_REAL = get_int_t<T_REAL>; // either KMER_INT or <KMER_INT, count>
     std::string rc_dir = dir/"rc";
     std::filesystem::create_directory(rc_dir);
     auto rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
             num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
-    logger->trace("Adding reverse complements...");
+
     common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original", ENCODER_BUFFER_SIZE);
     Vector<T_INT_REAL> buffer;
buffer.reserve(10'000);
- for (auto &it = kmers->begin(); it != kmers->end(); ++it) {
- const T_REAL &kmer = *it;
- const T_REAL &reverse = rev_comp(k + 1, *it, KmerExtractor2Bit().complement_code());
- if (get_first(kmer) != get_first(reverse)) {
-            buffer.push_back(reinterpret_cast<const T_INT_REAL &>(reverse));
- if (buffer.size() == buffer.capacity()) {
- rc_set->insert(buffer.begin(), buffer.end());
- buffer.resize(0);
- }
-            original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
- } else {
- if constexpr (utils::is_pair_v) {
- using C = typename T_REAL::second_type;
- if (kmer.second >> (sizeof(C) * 8 - 1)) {
- original.add({ kmer.first.data(), std::numeric_limits::max() });
- } else {
- original.add({ kmer.first.data(), 2 * kmer.second });
+ if (checkpoint->continuation_phase() < 2) {
+ logger->info("Adding reverse complements...");
+ for (auto &it = kmers->begin(); it != kmers->end(); ++it) {
+ const T_REAL &kmer = *it;
+ const T_REAL &reverse
+ = rev_comp(k + 1, *it, KmerExtractor2Bit().complement_code());
+ if (get_first(kmer) != get_first(reverse)) {
+            buffer.push_back(reinterpret_cast<const T_INT_REAL &>(reverse));
+ if (buffer.size() == buffer.capacity()) {
+ rc_set->insert(buffer.begin(), buffer.end());
+ buffer.resize(0);
}
- } else {
                original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
+ } else {
+ if constexpr (utils::is_pair_v) {
+ using C = typename T_REAL::second_type;
+ if (kmer.second >> (sizeof(C) * 8 - 1)) {
+ original.add({ kmer.first.data(), std::numeric_limits::max() });
+ } else {
+ original.add({ kmer.first.data(), 2 * kmer.second });
+ }
+ } else {
+                original.add(reinterpret_cast<const T_INT_REAL &>(kmer));
+ }
}
}
+ rc_set->insert(buffer.begin(), buffer.end());
+ original.finish();
+ checkpoint->set_phase(2);
+ checkpoint->store();
+ } else {
+ logger->info("Skipping adding reverse complements");
}
- rc_set->insert(buffer.begin(), buffer.end());
- original.finish();
+
+ if (checkpoint->continuation_phase() == 2) {
+ logger->info(
+ "Continuing from checkpoint phase 2. Looking for 'original' and "
+ "'rc/chunk_*' in {}",
+ checkpoint->kmer_dir());
+ if (!std::filesystem::exists(checkpoint->kmer_dir()/"original")) {
+ logger->error(
+                    "Could not find {}. Recovery not possible. Remove the tmp dir to "
+                    "restart the computation.",
+ checkpoint->kmer_dir()/"original");
+ }
+        std::vector<std::string> file_names;
+ for (const auto &path : std::filesystem::directory_iterator(checkpoint->kmer_dir()/"rc")) {
+ if (path.is_regular_file()
+ && path.path().filename().string().find("chunk_", 0) == 0
+ && path.path().filename().extension() == "") {
+ logger->trace("Found chunk: {}", path.path().string());
+ file_names.push_back(path.path().string());
+ }
+ }
+ if (file_names.empty()) {
+ logger->error(
+ "Could not find chunk_* files in {}. Recovery not possible. "
+ "Remove temp dir to restart the computation from scratch.",
+ checkpoint->kmer_dir());
+ std::exit(1);
+ }
+ rc_set.reset();
+ async_worker.enqueue([kmers, file_names = std::move(file_names)]() {
+            std::function<void(const T_INT_REAL &)> on_new_item
+                    = [&kmers, &file_names](const T_INT_REAL &v) { rc_set.push(v); };
+ merge_files(file_names, on_new_item, false);
+ rc_set.shutdown();
+ });
+ }
+
// start merging #original with #reverse_complements into #kmers
kmers->reset();
async_worker.enqueue([rc_set = std::move(rc_set), &dir, kmers]() {
        ChunkedWaitQueue<T_INT_REAL> &reverse_complements = rc_set->data(true);
-        common::EliasFanoDecoder<T_INT_REAL> original_kmers(dir / "original");
+        common::EliasFanoDecoder<T_INT_REAL> original_kmers(dir/"original");
merge(original_kmers, reverse_complements, kmers);
});
}
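
For counted k-mers, the loop above doubles the count of a k-mer that equals its own reverse complement (both strands contribute to the same entry) and saturates instead of overflowing: a count whose top bit is set would overflow when doubled, so it is clamped to the maximum. A worked sketch with an assumed 8-bit count type:

    // Sketch of the saturating doubling used above (8-bit count assumed).
    #include <cstdint>
    #include <limits>

    uint8_t double_count_saturating(uint8_t count) {
        // Top bit set means 2 * count would overflow the count type.
        if (count >> (sizeof(uint8_t) * 8 - 1))
            return std::numeric_limits<uint8_t>::max();
        return 2 * count;
    }
    // double_count_saturating(100) == 200, double_count_saturating(200) == 255.
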
@@ -612,7 +680,8 @@ template
void recover_dummy_nodes(const KmerCollector &kmer_collector,
                         ChunkedWaitQueue<T_REAL> &kmers,
                         ChunkedWaitQueue<T> *kmers_out,
- ThreadPool &async_worker) {
+ ThreadPool &async_worker,
+ BuildCheckpoint* checkpoint) {
    using KMER_REAL = get_first_type_t<T_REAL>; // 64/128/256-bit KmerBOSS
    using T_INT_REAL = get_int_t<T_REAL>; // either KMER_INT or <KMER_INT, count>
@@ -620,27 +689,57 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
size_t k = kmer_collector.get_k() - 1;
- const std::filesystem::path dir = kmer_collector.tmp_dir();
+ const std::filesystem::path dir = checkpoint->continuation_phase() == 0
+ ? kmer_collector.tmp_dir()
+ : checkpoint->kmer_dir();
size_t num_threads = kmer_collector.num_threads();
+ if (checkpoint->continuation_phase() == 1) {
+ logger->info(
+ "Continuing from checkpoint phase 1. Looking for chunk_* files in {}",
+ checkpoint->kmer_dir());
+        std::vector<std::string> file_names;
+ for (const auto &path : std::filesystem::directory_iterator(checkpoint->kmer_dir())) {
+ if (path.is_regular_file()
+ && path.path().filename().string().find("chunk_", 0) == 0
+ && path.path().filename().extension() == "") {
+ logger->trace("Found chunk: {}", path.path().string());
+ file_names.push_back(path.path().string());
+ }
+ }
+ if (file_names.empty()) {
+ logger->error(
+ "Could not find chunk_* files in {}. Recovery not possible. "
+ "Remove temp dir to restart the computation from scratch.",
+ checkpoint->kmer_dir());
+ std::exit(1);
+ }
+ kmers.reset();
+ async_worker.enqueue([kmers, file_names = std::move(file_names)]() {
+            std::function<void(const T &)> on_new_item
+ = [&kmers, &file_names](const T &v) { kmers.push(v); };
+ merge_files(file_names, on_new_item, false);
+ kmers.shutdown();
+ });
+ }
+
if (kmer_collector.is_both_strands_mode()) {
// compute the reverse complements of #kmers, then merge back into #kmers
add_reverse_complements(k, num_threads, kmer_collector.buffer_size(), dir,
- async_worker, &kmers);
+ async_worker, &kmers, checkpoint);
}
    std::string dummy_sink_name;
    std::vector<std::string> real_split_by_W;
    std::vector<std::string> dummy_names;
std::tie(real_split_by_W, dummy_names, dummy_sink_name)
- = generate_dummy_1_kmers(k, num_threads, dir, kmers);
+ = generate_dummy_1_kmers(k, num_threads, dir, kmers, checkpoint);
- // stores the sorted original kmers and dummy-1 k-mers
+ // file names for the dummy_sink_0..3 and dummy_source_0..k_0..3 kmers
    std::vector<std::string> dummy_chunks = { dummy_sink_name };
// generate dummy k-mers of prefix length 1..k
logger->trace("Starting generating dummy-1..k source k-mers...");
for (size_t dummy_pref_len = 1; dummy_pref_len <= k; ++dummy_pref_len) {
- // this will compress all sorted dummy k-mers of given prefix length
for (const std::string &f : dummy_names) {
dummy_chunks.push_back(f);
}
@@ -677,8 +776,11 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
    const std::function<void(const KMER_INT &)> on_merge = [](const KMER_INT &) {};
common::merge_files(dummy_names, on_merge);
- // at this point, we have the original k-mers and dummy-1 k-mers in original_and_dummy_l1,
- // the dummy-x k-mers in dummy_source_{x}, and we merge them all into a single stream
+ checkpoint->set_phase(6);
+ checkpoint->store();
+
+ // at this point, we have the original k-mers in real_split_by_W, the dummy-x k-mers
+ // in dummy_chunks, and we merge them all into a single stream
kmers_out->reset();
// add the main dummy source k-mer
@@ -703,7 +805,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
return kmer::transform(reinterpret_cast(v), k + 1) + kmer_delta;
}
},
- real_split_by_W, true
+ real_split_by_W, false /* remove sources */
);
common::Transformed, T> decoder_dummy(
@@ -714,7 +816,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
return reinterpret_cast(v);
}
},
- dummy_chunks, true
+ dummy_chunks, false /* remove sources */
);
while (!decoder.empty() && !decoder_dummy.empty()) {
@@ -770,7 +872,8 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
size_t num_threads,
double memory_preallocated,
const std::filesystem::path &tmp_dir,
- size_t max_disk_space)
+ size_t max_disk_space,
+ const BuildCheckpoint& checkpoint)
: kmer_collector_(k + 1,
both_strands_mode,
encode_filter_suffix_boss(filter_suffix),
@@ -779,26 +882,30 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
tmp_dir,
max_disk_space,
both_strands_mode && filter_suffix.empty() /* keep only canonical k-mers */),
- bits_per_count_(bits_per_count) {
- if (filter_suffix.size()
+ bits_per_count_(bits_per_count), checkpoint_(checkpoint) {
+ if (checkpoint.phase() == 0 && filter_suffix.size()
&& filter_suffix == std::string(filter_suffix.size(), BOSS::kSentinel)) {
            kmer_collector_.add_kmer(std::vector<TAlphabet>(k + 1, BOSS::kSentinelCode));
}
}
- void add_sequence(std::string_view sequence, uint64_t count) {
+ void add_sequence(std::string_view sequence, uint64_t count) override {
kmer_collector_.add_sequence(sequence, count);
}
-    void add_sequences(std::vector<std::string>&& sequences) {
+    void add_sequences(std::vector<std::string>&& sequences) override {
         kmer_collector_.add_sequences(std::move(sequences));
     }
-    void add_sequences(std::vector<std::pair<std::string, uint64_t>>&& sequences) {
+    void add_sequences(std::vector<std::pair<std::string, uint64_t>>&& sequences) override {
kmer_collector_.add_sequences(std::move(sequences));
}
- BOSS::Chunk* build_chunk() {
+ std::filesystem::path tmp_dir() const override {
+ return kmer_collector_.tmp_dir();
+ }
+
+ BOSS::Chunk* build_chunk() override {
BOSS::Chunk *result;
typename KmerCollector::Data &kmer_ints = kmer_collector_.data();
@@ -849,13 +956,14 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
return result;
}
- uint64_t get_k() const { return kmer_collector_.get_k() - 1; }
+ uint64_t get_k() const override { return kmer_collector_.get_k() - 1; }
private:
KmerCollector kmer_collector_;
uint8_t bits_per_count_;
/** Used as an async executor for merging chunks from disk */
ThreadPool async_worker_ = ThreadPool(1, 1);
+ BuildCheckpoint checkpoint_;
};
template <template <typename KMER> class KmerContainer, typename... Args>
@@ -950,9 +1058,10 @@ IBOSSChunkConstructor::initialize(size_t k,
double memory_preallocated,
kmer::ContainerType container_type,
const std::filesystem::path &tmp_dir,
- size_t max_disk_space_bytes) {
+ size_t max_disk_space_bytes,
+ const BuildCheckpoint& checkpoint) {
#define OTHER_ARGS k, canonical_mode, bits_per_count, filter_suffix, \
- num_threads, memory_preallocated, tmp_dir, max_disk_space_bytes
+ num_threads, memory_preallocated, tmp_dir, max_disk_space_bytes, checkpoint
switch (container_type) {
case kmer::ContainerType::VECTOR:
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
index fa40a71cce..0c22678a2a 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
@@ -9,6 +9,7 @@
#include "kmer/kmer_collector_config.hpp"
#include "graph/representation/base/dbg_construct.hpp"
#include "boss_chunk.hpp"
+#include "build_checkpoint.hpp"
namespace mtg {
@@ -28,9 +29,12 @@ class IBOSSChunkConstructor : public IGraphChunkConstructor {
double memory_preallocated = 0,
mtg::kmer::ContainerType container_type = mtg::kmer::ContainerType::VECTOR,
const std::filesystem::path &swap_dir = "/tmp/",
- size_t max_disk_space_bytes = 1e9);
+ size_t max_disk_space_bytes = 1e9,
+ const BuildCheckpoint& checkpoint = BuildCheckpoint("/tmp"));
virtual uint64_t get_k() const = 0;
+
+ virtual std::filesystem::path tmp_dir() const = 0;
};
} // namespace boss
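
Taken together, this first patch threads a BuildCheckpoint through the whole construction pipeline so an interrupted build can resume from the last completed phase. The raw phase numbers are renumbered in the next patch; by the end of patch 02 the mapping appears to be the following (a hypothetical enum for readability — the code stores plain integers):

    // Assumed meaning of the checkpoint phases after patch 02 renumbers them.
    enum BuildPhase : uint32_t {
        PHASE_NONE = 0,          // fresh build, nothing checkpointed
        PHASE_KMERS_ON_DISK = 1, // sorted real k-mer chunks written (chunk_*)
        PHASE_RC_ADDED = 2,      // reverse complements written ("original", rc/chunk_*)
        PHASE_SPLIT_DONE = 3,    // real k-mers split into the F x W chunks
        PHASE_DUMMY_1_DONE = 4,  // dummy-1 source and dummy sink k-mers generated
        PHASE_CONCAT_DONE = 5,   // blocks concatenated (dummy_sink, real_split_by_W_*)
        PHASE_DUMMY_K_DONE = 6,  // dummy source k-mers of all prefix lengths generated
    };
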
From 97294cdec9d753f753537beaab8a96c3d4102ca8 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Sat, 5 Sep 2020 14:02:32 +0200
Subject: [PATCH 02/51] First attempt
---
metagraph/src/cli/build.cpp | 5 +-
metagraph/src/common/elias_fano.cpp | 1 +
.../succinct/boss_chunk_construct.cpp | 488 +++++++++---------
3 files changed, 253 insertions(+), 241 deletions(-)
diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 2b131f40ac..448ed61f8d 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -122,11 +122,8 @@ int build_graph(Config *config) {
checkpoint
);
- if (checkpoint.continuation_phase() == 0) {
+ if (checkpoint.phase() == 0) {
push_sequences(files, *config, timer, constructor.get());
- checkpoint.set_phase(1);
- checkpoint.set_kmer_dir(constructor->tmp_dir());
- checkpoint.store();
} else {
logger->info("Skipping parsing sequences from input file(s)");
}
diff --git a/metagraph/src/common/elias_fano.cpp b/metagraph/src/common/elias_fano.cpp
index 3fc1f46100..6a3165465c 100644
--- a/metagraph/src/common/elias_fano.cpp
+++ b/metagraph/src/common/elias_fano.cpp
@@ -37,6 +37,7 @@ std::vector<std::string> concat(const std::vector<std::string> &files, const std
original_files.push_back(f + suffix);
}
}
+ return original_files;
}
template
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index 38f3c50d71..c0d7b4110f 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -244,7 +244,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
                          Vector<T> &kmers,
                          ChunkedWaitQueue<T> *kmers_out,
                          ThreadPool &async_worker,
-                         const BuildCheckpoint& ) {
+                         BuildCheckpoint* ) {
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $
     using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
@@ -348,15 +348,14 @@ using Decoder = common::EliasFanoDecoder<T>;
 template <typename T_REAL>
 std::vector<std::string> split(size_t k,
                                const std::filesystem::path &dir,
-                               const ChunkedWaitQueue<T_REAL> &kmers) {
+                               const ChunkedWaitQueue<T_REAL> &kmers,
+                               BuildCheckpoint* checkpoint) {
     using T_INT_REAL = get_int_t<T_REAL>;
     const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
     size_t chunk_count = std::pow(alphabet_size, 2);
-    logger->trace("Splitting k-mers into {} chunks...", chunk_count);
-
     std::vector<Encoder<T_INT_REAL>> sinks;
     std::vector<std::string> names(chunk_count);
     for (size_t i = 0; i < names.size(); ++i) {
@@ -364,6 +363,12 @@ std::vector<std::string> split(size_t k,
sinks.emplace_back(names[i], ENCODER_BUFFER_SIZE);
}
+ if (checkpoint->phase() > 2) {
+ logger->info("Skipping splitting k-mers into chunks");
+ return names;
+ }
+
+ logger->info("Splitting k-mers into {} chunks...", chunk_count);
size_t num_kmers = 0;
for (auto &it = kmers.begin(); it != kmers.end(); ++it) {
const T_REAL &kmer = *it;
@@ -375,6 +380,10 @@ std::vector split(size_t k,
}
std::for_each(sinks.begin(), sinks.end(), [](auto &f) { f.finish(); });
logger->trace("Total number of real k-mers: {}", num_kmers);
+
+ checkpoint->set_phase(3);
+ checkpoint->store();
+
return names;
}
@@ -389,13 +398,60 @@ void skip_same_suffix(const KMER &el, Decoder &decoder, size_t suf) {
}
}
+std::pair<std::vector<std::string>, std::string>
+concatenate_chunks(const std::filesystem::path &dir,
+                   const std::vector<std::string> &dummy_sink_names,
+                   const std::vector<std::string> &real_F_W,
+                   BuildCheckpoint *checkpoint) {
+    const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
+
+    std::vector<std::string> real_split_by_W(alphabet_size);
+ std::string dummy_sink_name = dir / "dummy_sink";
+ for (TAlphabet W = 0; W < alphabet_size; ++W) {
+ real_split_by_W[W] = dir/("real_split_by_W_" + std::to_string(W));
+ }
+
+ if (checkpoint->phase() > 4) {
+ return { real_split_by_W, dummy_sink_name };
+ }
+
+ // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
+ // concatenating the blocks will result in a single ordered block
+ logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
+ dummy_sink_names.size());
+    std::vector<std::string> to_delete
+ = common::concat(dummy_sink_names, dummy_sink_name);
+
+ // similarly, the 16 blocks of the original k-mers can be concatenated in
+ // groups of 4 without destroying the order
+ logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
+ real_F_W.size(), alphabet_size);
+ for (TAlphabet W = 0; W < alphabet_size; ++W) {
+        std::vector<std::string> blocks;
+ for (TAlphabet F = 0; F < alphabet_size; ++F) {
+ blocks.push_back(real_F_W[F * alphabet_size + W]);
+ }
+        std::vector<std::string> original
+ = common::concat(blocks, real_split_by_W[W]);
+ to_delete.insert(to_delete.end(), original.begin(), original.end());
+ }
+
+ for (const auto &name : to_delete) {
+ std::filesystem::remove(name);
+ }
+
+ checkpoint->set_phase(5);
+ checkpoint->store();
+ return { real_split_by_W, dummy_sink_name };
+}
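
concatenate_chunks() follows the resume pattern used throughout this series: compute the output names deterministically up front, return them immediately if the recorded phase says the step already ran, otherwise do the work, advance the phase, and persist it. Stripped to its shape (a sketch, not code from the patch):

    // Generic shape of a resumable step; 'phase_done' is the value recorded
    // once this step has completed.
    template <typename Work>
    void resumable_step(BuildCheckpoint *checkpoint, uint32_t phase_done, Work work) {
        if (checkpoint->phase() >= phase_done)
            return; // outputs are already on disk; skip the work
        work(); // produce the outputs (safe to re-run from scratch)
        checkpoint->set_phase(phase_done);
        checkpoint->store(); // persist progress before moving on
    }

Because the output names are derived only from #dir and the alphabet, a resumed run reconstructs the same names without redoing the work.
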
+
/**
* Generates non-redundant dummy-1 source k-mers and dummy sink kmers from #kmers.
* @return a triplet containing the names of the original k-mer blocks, the dummy-1 source
* k-mer blocks and the dummy sink k-mers
*/
template
-std::tuple<std::vector<std::string>, std::vector<std::string>, std::string>
+std::pair<std::vector<std::string>, std::vector<std::string>>
 generate_dummy_1_kmers(size_t k,
                        size_t num_threads,
                        const std::filesystem::path &dir,
@@ -408,9 +464,7 @@ generate_dummy_1_kmers(size_t k,
     using KMER_INT_REAL = typename KMER_REAL::WordType; // KmerExtractorT::KmerBOSS::WordType
     // for a DNA alphabet, this will contain 16 chunks, split by kmer[0] and kmer[1]
-    std::vector<std::string> real_F_W = split(k, dir, kmers);
-    checkpoint->set_phase(3);
-    checkpoint->store();
+    std::vector<std::string> real_F_W = split(k, dir, kmers, checkpoint);
const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
@@ -425,144 +479,84 @@ generate_dummy_1_kmers(size_t k,
dummy_sink_chunks.emplace_back(dummy_sink_names[i], ENCODER_BUFFER_SIZE);
}
- if (checkpoint->continuation_phase() < 3) {
- logger->info("Generating dummy-1 source k-mers and dummy sink k-mers...");
- uint64_t num_sink = 0;
- uint64_t num_source = 0;
+ if (checkpoint->phase() > 3) {
+        logger->info("Skipping generating dummy-1 source k-mers and dummy sink k-mers");
+ return { dummy_sink_names, real_F_W };
+ }
- static constexpr size_t L = KMER::kBitsPerChar;
- KMER_INT kmer_delta = kmer::get_sentinel_delta(L, k + 1);
- // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
- kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
+ logger->info("Generating dummy-1 source k-mers and dummy sink k-mers...");
+ uint64_t num_sink = 0;
+ uint64_t num_source = 0;
- #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
- for (TAlphabet F = 0; F < alphabet_size; ++F) {
- // stream k-mers of pattern ***F*
-            std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
-                                              real_F_W.begin() + (F + 1) * alphabet_size);
-            common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
-
-            std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
-            for (TAlphabet c = 0; c < alphabet_size; ++c) {
-                W_chunks.push_back(real_F_W[c * alphabet_size + F]);
-            }
-            common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
-
- while (!it.empty()) {
- KMER_REAL dummy_source(it.pop());
- // skip k-mers that would generate identical source dummy k-mers
- skip_same_suffix(dummy_source, it, 0);
- dummy_source.to_prev(k + 1, 0);
- // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
- while (!sink_gen_it.empty() && sink_gen_it.top() < dummy_source.data()) {
- KMER_REAL v(sink_gen_it.pop());
- // skip k-mers with the same suffix as v, as they generate identical
- // dummy sink k-mers
- skip_same_suffix(v, sink_gen_it, 1);
- dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
- num_sink++;
- }
- if (!sink_gen_it.empty()) {
- KMER_REAL top(sink_gen_it.top());
- if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
- // The source dummy k-mer #dummy_source generated from #it is
- // redundant iff it shares its suffix with another real k-mer (#top).
- // In this case, #top generates a dummy sink k-mer redundant with
- // #it. So if #dummy_source is redundant, the sink generated from
- // #top is also redundant - so it's being skipped
- skip_same_suffix(top, sink_gen_it, 1);
- continue;
- }
- }
- // lift all and reset the first character to the sentinel 0 (apply mask)
- dummy_l1_chunks[F].add(kmer::transform(dummy_source, k + 1)
- + kmer_delta);
- num_source++;
- }
- // handle leftover sink_gen_it
- while (!sink_gen_it.empty()) {
+ static constexpr size_t L = KMER::kBitsPerChar;
+ KMER_INT kmer_delta = kmer::get_sentinel_delta(L, k + 1);
+ // reset kmer[1] (the first character in k-mer, $ in dummy source) to zero
+ kmer_delta &= ~KMER_INT(((1ull << L) - 1) << L);
+
+ #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
+ for (TAlphabet F = 0; F < alphabet_size; ++F) {
+ // stream k-mers of pattern ***F*
+        std::vector<std::string> F_chunks(real_F_W.begin() + F * alphabet_size,
+                                          real_F_W.begin() + (F + 1) * alphabet_size);
+        common::MergeDecoder<KMER_INT_REAL> it(F_chunks, false);
+
+        std::vector<std::string> W_chunks; // chunks with k-mers of the form ****F
+        for (TAlphabet c = 0; c < alphabet_size; ++c) {
+            W_chunks.push_back(real_F_W[c * alphabet_size + F]);
+        }
+        common::ConcatDecoder<KMER_INT_REAL> sink_gen_it(W_chunks);
+
+ while (!it.empty()) {
+ KMER_REAL dummy_source(it.pop());
+ // skip k-mers that would generate identical source dummy k-mers
+ skip_same_suffix(dummy_source, it, 0);
+ dummy_source.to_prev(k + 1, 0);
+ // generate dummy sink k-mers from all non-dummy kmers smaller than |dummy_source|
+ while (!sink_gen_it.empty() && sink_gen_it.top() < dummy_source.data()) {
KMER_REAL v(sink_gen_it.pop());
+ // skip k-mers with the same suffix as v, as they generate identical
+ // dummy sink k-mers
skip_same_suffix(v, sink_gen_it, 1);
dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
num_sink++;
}
+ if (!sink_gen_it.empty()) {
+ KMER_REAL top(sink_gen_it.top());
+ if (KMER_REAL::compare_suffix(top, dummy_source, 1)) {
+ // The source dummy k-mer #dummy_source generated from #it is
+ // redundant iff it shares its suffix with another real k-mer (#top).
+ // In this case, #top generates a dummy sink k-mer redundant with
+ // #it. So if #dummy_source is redundant, the sink generated from
+ // #top is also redundant - so it's being skipped
+ skip_same_suffix(top, sink_gen_it, 1);
+ continue;
+ }
+ }
+ // lift all and reset the first character to the sentinel 0 (apply mask)
+ dummy_l1_chunks[F].add(kmer::transform(dummy_source, k + 1)
+ + kmer_delta);
+ num_source++;
}
-
- for (TAlphabet i = 0; i < alphabet_size; ++i) {
- dummy_sink_chunks[i].finish();
- dummy_l1_chunks[i].finish();
+ // handle leftover sink_gen_it
+ while (!sink_gen_it.empty()) {
+ KMER_REAL v(sink_gen_it.pop());
+ skip_same_suffix(v, sink_gen_it, 1);
+ dummy_sink_chunks[F].add(kmer::get_sink_and_lift(v, k + 1));
+ num_sink++;
}
-
- logger->trace("Generated {} dummy sink and {} dummy source k-mers", num_sink,
- num_source);
- checkpoint->set_phase(3);
- checkpoint->store();
- } else {
-        logger->info("Skipping generating dummy-1 source k-mers and dummy sink k-mers");
}
-    std::vector<std::string> real_split_by_W(alphabet_size);
- std::string dummy_sink_name = dir / "dummy_sink";
- for (TAlphabet W = 0; W < alphabet_size; ++W) {
- real_split_by_W[W] = dir/("real_split_by_W_" + std::to_string(W));
- }
- if (checkpoint->continuation_phase() < 4) {
- // dummy sink k-mers are partitioned into blocks by F (kmer[1]), so simply
- // concatenating the blocks will result in a single ordered block
- logger->trace("Concatenating blocks of dummy sink k-mers ({} -> 1)...",
- dummy_sink_names.size());
-        std::vector<std::string> to_delete
- = common::concat(dummy_sink_names, dummy_sink_name);
-
- // similarly, the 16 blocks of the original k-mers can be concatenated in
- // groups of 4 without destroying the order
- logger->trace("Concatenating blocks of original real k-mers ({} -> {})...",
- real_F_W.size(), alphabet_size);
- for (TAlphabet W = 0; W < alphabet_size; ++W) {
-            std::vector<std::string> blocks;
- for (TAlphabet F = 0; F < alphabet_size; ++F) {
- blocks.push_back(real_F_W[F * alphabet_size + W]);
- }
-            std::vector<std::string> original
- = common::concat(blocks, real_split_by_W[W]);
- to_delete.insert(to_delete.end(), original.begin(), original.end());
- }
- for (const auto &name : to_delete) {
- std::filesystem::remove(name);
- }
- checkpoint->set_phase(4);
- checkpoint->store();
+ for (TAlphabet i = 0; i < alphabet_size; ++i) {
+ dummy_sink_chunks[i].finish();
+ dummy_l1_chunks[i].finish();
}
- return { real_split_by_W, dummy_l1_names, dummy_sink_name };
-}
+ logger->trace("Generated {} dummy sink and {} dummy source k-mers", num_sink,
+ num_source);
+ checkpoint->set_phase(4);
+ checkpoint->store();
-/** Merges #original_kmers with #reverse_complements and places the result into #kmers */
-template <typename T, typename T_INT>
-static void merge(common::EliasFanoDecoder<T_INT> &original_kmers,
-                  ChunkedWaitQueue<T_INT> &reverse_complements,
-                  ChunkedWaitQueue<T> *kmers) {
-    auto &kmers_int = reinterpret_cast<ChunkedWaitQueue<T_INT> &>(*kmers);
-    auto &it = reverse_complements.begin();
-    std::optional<T_INT> orig = original_kmers.next();
- while (it != reverse_complements.end() && orig.has_value()) {
- if (get_first(orig.value()) < get_first(*it)) {
- kmers_int.push(orig.value());
- orig = original_kmers.next();
- } else {
- kmers_int.push(*it);
- ++it;
- }
- }
- while (it != reverse_complements.end()) {
- kmers_int.push(*it);
- ++it;
- }
- while (orig.has_value()) {
- kmers_int.push(orig.value());
- orig = original_kmers.next();
- }
- kmers->shutdown();
+ return { dummy_sink_names, real_F_W };
}
/**
@@ -576,17 +570,49 @@ void add_reverse_complements(size_t k,
ThreadPool& async_worker,
                             ChunkedWaitQueue<T_REAL> *kmers,
BuildCheckpoint *checkpoint) {
+ if (checkpoint->phase() > 2) {
+ logger->info("Skipping generating reverse complements");
+ return;
+ }
     using T_INT_REAL = get_int_t<T_REAL>; // either KMER_INT or <KMER_INT, count>
-    std::string rc_dir = dir/"rc";
-    std::filesystem::create_directory(rc_dir);
-    auto rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
-            num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
-
-    common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original", ENCODER_BUFFER_SIZE);
-    Vector<T_INT_REAL> buffer;
-    buffer.reserve(10'000);
-    if (checkpoint->continuation_phase() < 2) {
+    std::vector<std::string> to_merge = { dir/"original" };
+ if (checkpoint->phase() == 2) {
+ logger->info(
+ "Continuing from checkpoint phase 2. Looking for 'original' and "
+ "'rc/chunk_*' in {}",
+ checkpoint->kmer_dir());
+ if (!std::filesystem::exists(checkpoint->kmer_dir()/"original")) {
+ logger->error(
+ "Could not find {}. Recovery not possible. Remove tmp dir to "
+ "restart the computation.",
+ checkpoint->kmer_dir()/"original");
+ std::exit(1);
+ }
+ for (const auto &path : std::filesystem::directory_iterator(checkpoint->kmer_dir()/"rc")) {
+ if (path.is_regular_file()
+ && path.path().filename().string().find("chunk_", 0) == 0
+ && path.path().filename().extension() == "") {
+ logger->trace("Found chunk: {}", path.path().string());
+ to_merge.push_back(path.path().string());
+ }
+ }
+ if (to_merge.size() == 1) {
+ logger->error(
+ "Could not find chunk_* files in {}. Recovery not possible. "
+ "Remove temp dir to restart the computation from scratch.",
+ checkpoint->kmer_dir());
+ std::exit(1);
+ }
+ } else { // checkpoint->phase() < 2
+ std::string rc_dir = dir/"rc";
+ std::filesystem::create_directory(rc_dir);
+        auto rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
+                num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
+
+        common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original", ENCODER_BUFFER_SIZE);
+        Vector<T_INT_REAL> buffer;
+ buffer.reserve(10'000);
logger->info("Adding reverse complements...");
for (auto &it = kmers->begin(); it != kmers->end(); ++it) {
const T_REAL &kmer = *it;
@@ -613,55 +639,24 @@ void add_reverse_complements(size_t k,
}
}
rc_set->insert(buffer.begin(), buffer.end());
+        std::vector<std::string> to_insert = rc_set->files_to_merge();
+ to_merge.insert(to_merge.end(), to_insert.begin(), to_insert.end());
original.finish();
checkpoint->set_phase(2);
checkpoint->store();
- } else {
- logger->info("Skipping adding reverse complements");
- }
-
- if (checkpoint->continuation_phase() == 2) {
- logger->info(
- "Continuing from checkpoint phase 2. Looking for 'original' and "
- "'rc/chunk_*' in {}",
- checkpoint->kmer_dir());
- if (!std::filesystem::exists(checkpoint->kmer_dir()/"original")) {
- logger->error(
-                "Could not find {}. Recovery not possible. Remove the tmp dir to "
-                "restart the computation.",
- checkpoint->kmer_dir()/"original");
- }
-        std::vector<std::string> file_names;
- for (const auto &path : std::filesystem::directory_iterator(checkpoint->kmer_dir()/"rc")) {
- if (path.is_regular_file()
- && path.path().filename().string().find("chunk_", 0) == 0
- && path.path().filename().extension() == "") {
- logger->trace("Found chunk: {}", path.path().string());
- file_names.push_back(path.path().string());
- }
- }
- if (file_names.empty()) {
- logger->error(
- "Could not find chunk_* files in {}. Recovery not possible. "
- "Remove temp dir to restart the computation from scratch.",
- checkpoint->kmer_dir());
- std::exit(1);
- }
- rc_set.reset();
- async_worker.enqueue([kmers, file_names = std::move(file_names)]() {
-            std::function<void(const T_INT_REAL &)> on_new_item
-                    = [&kmers, &file_names](const T_INT_REAL &v) { rc_set.push(v); };
- merge_files(file_names, on_new_item, false);
- rc_set.shutdown();
- });
}
// start merging #original with #reverse_complements into #kmers
kmers->reset();
- async_worker.enqueue([rc_set = std::move(rc_set), &dir, kmers]() {
-        ChunkedWaitQueue<T_INT_REAL> &reverse_complements = rc_set->data(true);
-        common::EliasFanoDecoder<T_INT_REAL> original_kmers(dir/"original");
- merge(original_kmers, reverse_complements, kmers);
+ async_worker.enqueue([rc_files = std::move(to_merge), kmers]() {
+        common::MergeDecoder<T_INT_REAL> chunked_kmers(rc_files, false);
+
+        auto &kmers_int = reinterpret_cast<ChunkedWaitQueue<T_INT_REAL> &>(*kmers);
+        std::optional<T_INT_REAL> kmer;
+ while ((kmer = chunked_kmers.pop()).has_value()) {
+ kmers_int.push(kmer.value());
+ }
+ kmers->shutdown();
});
}
@@ -688,13 +683,18 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
     using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $
using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
+ uint32_t previous_phase = checkpoint->phase();
+ if (checkpoint->phase() == 0) {
+ checkpoint->set_kmer_dir(kmer_collector.tmp_dir());
+ checkpoint->set_phase(1);
+ checkpoint->store();
+ }
+
size_t k = kmer_collector.get_k() - 1;
- const std::filesystem::path dir = checkpoint->continuation_phase() == 0
- ? kmer_collector.tmp_dir()
- : checkpoint->kmer_dir();
+ const std::filesystem::path dir = checkpoint->kmer_dir();
size_t num_threads = kmer_collector.num_threads();
- if (checkpoint->continuation_phase() == 1) {
+ if (previous_phase == 1) {
logger->info(
"Continuing from checkpoint phase 1. Looking for chunk_* files in {}",
checkpoint->kmer_dir());
@@ -715,11 +715,14 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
std::exit(1);
}
kmers.reset();
- async_worker.enqueue([kmers, file_names = std::move(file_names)]() {
-        std::function<void(const T &)> on_new_item
-            = [&kmers, &file_names](const T &v) { kmers.push(v); };
- merge_files(file_names, on_new_item, false);
+ async_worker.enqueue([&kmers, file_names = std::move(file_names)]() {
+        auto &kmers_int = reinterpret_cast<ChunkedWaitQueue<T_INT_REAL> &>(kmers);
+        std::function<void(const T_INT_REAL &)> on_new_item
+                = [&kmers_int](const T_INT_REAL &v) { kmers_int.push(v); };
+ common::merge_files(file_names, on_new_item, false);
kmers.shutdown();
+ std::for_each(file_names.begin(), file_names.end(),
+ [](const auto &f) { std::filesystem::remove(f); });
});
}
@@ -729,55 +732,66 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
async_worker, &kmers, checkpoint);
}
-    std::string dummy_sink_name;
-    std::vector<std::string> real_split_by_W;
-    std::vector<std::string> dummy_names;
-    std::tie(real_split_by_W, dummy_names, dummy_sink_name)
+ auto [dummy_sink_names, real_F_W]
= generate_dummy_1_kmers(k, num_threads, dir, kmers, checkpoint);
- // file names for the dummy_sink_0..3 and dummy_source_0..k_0..3 kmers
-    std::vector<std::string> dummy_chunks = { dummy_sink_name };
- // generate dummy k-mers of prefix length 1..k
- logger->trace("Starting generating dummy-1..k source k-mers...");
- for (size_t dummy_pref_len = 1; dummy_pref_len <= k; ++dummy_pref_len) {
- for (const std::string &f : dummy_names) {
- dummy_chunks.push_back(f);
- }
+    std::vector<std::string> real_split_by_W;
+ std::string dummy_sink_name;
+ std::tie(real_split_by_W, dummy_sink_name)
+ = concatenate_chunks(dir, dummy_sink_names, real_F_W, checkpoint);
- const uint8_t alphabet_size = KmerExtractorBOSS::alphabet.size();
-        std::vector<std::string> dummy_next_names(alphabet_size);
-        std::vector<Encoder<KMER_INT>> dummy_next_chunks;
+ // file names for the dummy_sink and dummy_source_1..k_0..3 kmers
+    std::vector<std::string> dummy_chunk_names;
+ const uint8_t alphabet_size = KmerExtractorBOSS::alphabet.size();
+ for (size_t dummy_pref_len = 1; dummy_pref_len <= k; ++dummy_pref_len) {
for (TAlphabet i = 0; i < alphabet_size; ++i) {
- dummy_next_names[i] = dir/("dummy_source_"
- + std::to_string(dummy_pref_len + 1) + "_" + std::to_string(i));
- dummy_next_chunks.emplace_back(dummy_next_names[i], ENCODER_BUFFER_SIZE);
+ std::string suffix
+ = std::to_string(dummy_pref_len + 1) + "_" + std::to_string(i);
+ dummy_chunk_names.push_back(dir/("dummy_source_" + suffix));
}
+ }
+ dummy_chunk_names.push_back(dummy_sink_name);
- KMER prev_kmer(0);
- uint64_t num_kmers = 0;
-        const std::function<void(const KMER_INT &)> &write_dummy = [&](const KMER_INT &v) {
- KMER kmer(v);
- kmer.to_prev(k + 1, BOSS::kSentinelCode);
- if (prev_kmer != kmer) {
- dummy_next_chunks[kmer[0]].add(kmer.data());
- prev_kmer = std::move(kmer);
+ if (checkpoint->phase() < 6) {
+ // generate dummy k-mers of prefix length 1..k
+ logger->trace("Starting generating dummy-1..{} source k-mers...", k);
+ for (size_t dummy_pref_len = 2; dummy_pref_len <= k; ++dummy_pref_len) {
+
+            std::vector<Encoder<KMER_INT>> next_chunks;
+ for (TAlphabet i = 0; i < alphabet_size; ++i) {
+ next_chunks.emplace_back(dummy_chunk_names[dummy_pref_len * alphabet_size + i],
+ ENCODER_BUFFER_SIZE);
}
- num_kmers++;
- };
- common::merge_files(dummy_names, write_dummy, false);
-
- std::for_each(dummy_next_chunks.begin(), dummy_next_chunks.end(),
- [](auto &v) { v.finish(); });
- dummy_names = std::move(dummy_next_names);
- logger->trace("Number of dummy k-mers with dummy prefix of length {}: {}",
- dummy_pref_len, num_kmers);
- }
- // remove the last chunks with .up and .count
-    const std::function<void(const KMER_INT &)> on_merge = [](const KMER_INT &) {};
- common::merge_files(dummy_names, on_merge);
- checkpoint->set_phase(6);
- checkpoint->store();
+ // the chunks containing (dummy_pref_len-1) dummy k-mers
+ auto begin = dummy_chunk_names.begin() + (dummy_pref_len - 1) * alphabet_size;
+ std::vector current_names(begin, begin + alphabet_size);
+
+ KMER prev_kmer(0);
+ uint64_t num_kmers = 0;
+            const std::function<void(const KMER_INT &)> &write_dummy
+                    = [&](const KMER_INT &v) {
+ KMER kmer(v);
+ kmer.to_prev(k + 1, BOSS::kSentinelCode);
+ if (prev_kmer != kmer) {
+ next_chunks[kmer[0]].add(kmer.data());
+ prev_kmer = std::move(kmer);
+ }
+ num_kmers++;
+ };
+ common::merge_files(current_names, write_dummy, false);
+
+ std::for_each(next_chunks.begin(), next_chunks.end(),
+ [](auto &v) { v.finish(); });
+ logger->trace("Number of dummy k-mers with dummy prefix of length {}: {}",
+ dummy_pref_len - 1, num_kmers);
+ }
+
+ checkpoint->set_phase(6);
+ checkpoint->store();
+ } else {
+ logger->info("Skipping generating dummy-1..{} source k-mers", k);
+ }
// at this point, we have the original k-mers in real_split_by_W, the dummy-x k-mers
// in dummy_chunks, and we merge them all into a single stream
@@ -794,7 +808,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
= kmer::get_sentinel_delta(KMER::kBitsPerChar, k + 1);
// push all other dummy and non-dummy k-mers to |kmers_out|
- async_worker.enqueue([k, kmer_delta, kmers_out, real_split_by_W, dummy_chunks]() {
+ async_worker.enqueue([k, kmer_delta, kmers_out, real_split_by_W, dummy_chunk_names]() {
common::Transformed, T> decoder(
[&](const T_INT_REAL &v) {
if constexpr (utils::is_pair_v) {
@@ -816,7 +830,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
return reinterpret_cast(v);
}
},
- dummy_chunks, false /* remove sources */
+ dummy_chunk_names, false /* remove sources */
);
while (!decoder.empty() && !decoder_dummy.empty()) {
@@ -883,7 +897,7 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
max_disk_space,
both_strands_mode && filter_suffix.empty() /* keep only canonical k-mers */),
bits_per_count_(bits_per_count), checkpoint_(checkpoint) {
- if (checkpoint.phase() == 0 && filter_suffix.size()
+ if (filter_suffix.size()
&& filter_suffix == std::string(filter_suffix.size(), BOSS::kSentinel)) {
             kmer_collector_.add_kmer(std::vector<TAlphabet>(k + 1, BOSS::kSentinelCode));
}
@@ -933,7 +947,7 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
#define INIT_CHUNK(KMER) \
ChunkedWaitQueue> queue(ENCODER_BUFFER_SIZE); \
- recover_dummy_nodes(kmer_collector_, kmers, &queue, async_worker_); \
+ recover_dummy_nodes(kmer_collector_, kmers, &queue, async_worker_, &checkpoint_); \
logger->trace("Dummy source k-mers were reconstructed in {} sec", timer.elapsed()); \
result = new BOSS::Chunk(KmerExtractorBOSS().alphabet.size(), \
kmer_collector_.get_k() - 1, \
@@ -961,7 +975,7 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
private:
KmerCollector kmer_collector_;
uint8_t bits_per_count_;
- /** Used as an async executor for merging chunks from disk */
+ /** Async executor for merging chunks, generating reverse complements, etc. */
ThreadPool async_worker_ = ThreadPool(1, 1);
BuildCheckpoint checkpoint_;
};
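
The clear(tmp_path, remove_files) change is what makes phase 2 recoverable: the constructor grabs the chunk file names with files_to_merge(), releases the SortedSetDisk buffer, but keeps the chunk files on disk so they can be re-merged after a restart. The intended call sequence, as used in add_reverse_complements() above:

    // Keep the rc chunk files for checkpoint recovery; only the in-memory
    // buffer and the merge state are reset.
    std::vector<std::string> to_insert = rc_set->files_to_merge();
    to_merge.insert(to_merge.end(), to_insert.begin(), to_insert.end());
    rc_set->clear(dir, false /* don't delete chunk files! */);
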
From 1f731ff18f21558a1b44a8a16e88b3e802d7ab80 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Sun, 6 Sep 2020 18:42:33 +0200
Subject: [PATCH 03/51] Running
---
metagraph/src/common/elias_fano.cpp | 1 -
.../sorted_sets/sorted_set_disk_base.cpp | 10 ++--
.../sorted_sets/sorted_set_disk_base.hpp | 2 +-
.../succinct/boss_chunk_construct.cpp | 55 +++++++++++--------
4 files changed, 38 insertions(+), 30 deletions(-)
diff --git a/metagraph/src/common/elias_fano.cpp b/metagraph/src/common/elias_fano.cpp
index 6a3165465c..f8a6170e5f 100644
--- a/metagraph/src/common/elias_fano.cpp
+++ b/metagraph/src/common/elias_fano.cpp
@@ -29,7 +29,6 @@ std::vector<std::string> concat(const std::vector<std::string> &files, const std
concat_command += files[i] + suffix + " ";
}
concat_command += " > " + result + suffix;
- logger->trace("Executing '{}'", concat_command);
if (std::system(concat_command.c_str()))
throw std::runtime_error("Error while cat-ing files: " + concat_command);
diff --git a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
index 49c8bc7092..87678abe20 100644
--- a/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
+++ b/metagraph/src/common/sorted_sets/sorted_set_disk_base.cpp
@@ -64,13 +64,15 @@ std::vector<std::string> SortedSetDiskBase<T>::files_to_merge() {
 }
 template <typename T>
-void SortedSetDiskBase<T>::clear(const std::filesystem::path &tmp_path) {
+void SortedSetDiskBase<T>::clear(const std::filesystem::path &tmp_path, bool remove_files) {
std::unique_lock exclusive_lock(mutex_);
std::unique_lock multi_insert_lock(multi_insert_mutex_);
is_merging_ = false;
- // remove the files that have not been requested to merge
- for (const auto &chunk_file : get_file_names()) {
- std::filesystem::remove(chunk_file);
+ if (remove_files) {
+ // remove the files that have not been requested to merge
+ for (const auto &chunk_file : get_file_names()) {
+ std::filesystem::remove(chunk_file);
+ }
}
chunk_count_ = 0;
l1_chunk_count_ = 0;
diff --git a/metagraph/src/common/sorted_sets/sorted_set_disk_base.hpp b/metagraph/src/common/sorted_sets/sorted_set_disk_base.hpp
index bc60cb4342..2ba2603723 100644
--- a/metagraph/src/common/sorted_sets/sorted_set_disk_base.hpp
+++ b/metagraph/src/common/sorted_sets/sorted_set_disk_base.hpp
@@ -74,7 +74,7 @@ class SortedSetDiskBase {
* sorted set may be expensive when #data_ is large. In these cases, prefer calling
* #clear and re-using the buffer.
*/
- void clear(const std::filesystem::path &tmp_path = "/tmp/");
+ void clear(const std::filesystem::path &tmp_path = "/tmp/", bool remove_files = true);
protected:
/** Advances #it by step or points to #end, whichever comes first. */
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index c0d7b4110f..5a6f7ca1e4 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -360,7 +360,6 @@ std::vector<std::string> split(size_t k,
     std::vector<std::string> names(chunk_count);
for (size_t i = 0; i < names.size(); ++i) {
names[i] = dir/("real_F_W_" + std::to_string(i));
- sinks.emplace_back(names[i], ENCODER_BUFFER_SIZE);
}
if (checkpoint->phase() > 2) {
@@ -368,6 +367,10 @@ std::vector<std::string> split(size_t k,
return names;
}
+ for (size_t i = 0; i < names.size(); ++i) {
+ sinks.emplace_back(names[i], ENCODER_BUFFER_SIZE);
+ }
+
logger->info("Splitting k-mers into {} chunks...", chunk_count);
size_t num_kmers = 0;
for (auto &it = kmers.begin(); it != kmers.end(); ++it) {
@@ -475,8 +478,6 @@ generate_dummy_1_kmers(size_t k,
for (TAlphabet i = 0; i < alphabet_size; ++i) {
dummy_l1_names[i] = dir/("dummy_source_1_" + std::to_string(i));
dummy_sink_names[i] = dir/("dummy_sink_" + std::to_string(i));
- dummy_l1_chunks.emplace_back(dummy_l1_names[i], ENCODER_BUFFER_SIZE);
- dummy_sink_chunks.emplace_back(dummy_sink_names[i], ENCODER_BUFFER_SIZE);
}
if (checkpoint->phase() > 3) {
@@ -484,6 +485,11 @@ generate_dummy_1_kmers(size_t k,
return { dummy_sink_names, real_F_W };
}
+ for (TAlphabet i = 0; i < alphabet_size; ++i) {
+ dummy_l1_chunks.emplace_back(dummy_l1_names[i], ENCODER_BUFFER_SIZE);
+ dummy_sink_chunks.emplace_back(dummy_sink_names[i], ENCODER_BUFFER_SIZE);
+ }
+
logger->info("Generating dummy-1 source k-mers and dummy sink k-mers...");
uint64_t num_sink = 0;
uint64_t num_source = 0;
@@ -576,6 +582,7 @@ void add_reverse_complements(size_t k,
}
using T_INT_REAL = get_int_t; // either KMER_INT or
+    std::unique_ptr<common::SortedSetDisk<T_INT_REAL>> rc_set;
     std::vector<std::string> to_merge = { dir/"original" };
if (checkpoint->phase() == 2) {
logger->info(
@@ -607,7 +614,7 @@ void add_reverse_complements(size_t k,
} else { // checkpoint->phase() < 2
std::string rc_dir = dir/"rc";
std::filesystem::create_directory(rc_dir);
-        auto rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
+        rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
                 num_threads, buffer_size, rc_dir, std::numeric_limits<size_t>::max());
         common::EliasFanoEncoderBuffered<T_INT_REAL> original(dir/"original", ENCODER_BUFFER_SIZE);
@@ -641,6 +648,7 @@ void add_reverse_complements(size_t k,
rc_set->insert(buffer.begin(), buffer.end());
         std::vector<std::string> to_insert = rc_set->files_to_merge();
to_merge.insert(to_merge.end(), to_insert.begin(), to_insert.end());
+ rc_set->clear(dir, false /* don't delete chunk files! */);
original.finish();
checkpoint->set_phase(2);
checkpoint->store();
@@ -648,13 +656,11 @@ void add_reverse_complements(size_t k,
// start merging #original with #reverse_complements into #kmers
kmers->reset();
- async_worker.enqueue([rc_files = std::move(to_merge), kmers]() {
-        common::MergeDecoder<T_INT_REAL> chunked_kmers(rc_files, false);
-
+    async_worker.enqueue([to_merge = std::move(to_merge), kmers]() {
+        common::MergeDecoder<T_INT_REAL> chunked_kmers(to_merge, false);
         auto &kmers_int = reinterpret_cast<ChunkedWaitQueue<T_INT_REAL> &>(*kmers);
-        std::optional<T_INT_REAL> kmer;
- while ((kmer = chunked_kmers.pop()).has_value()) {
- kmers_int.push(kmer.value());
+ while (!chunked_kmers.empty()) {
+ kmers_int.push(chunked_kmers.pop());
}
kmers->shutdown();
});
@@ -672,16 +678,16 @@ void add_reverse_complements(size_t k,
* the dummy-k kmers, for k=2..k
*/
template
-void recover_dummy_nodes(const KmerCollector &kmer_collector,
+[[clang::optnone]] void recover_dummy_nodes(const KmerCollector &kmer_collector,
ChunkedWaitQueue &kmers,
ChunkedWaitQueue *kmers_out,
ThreadPool &async_worker,
BuildCheckpoint* checkpoint) {
- using KMER_REAL = get_first_type_t<T_REAL>; // 64/128/256-bit KmerBOSS
- using T_INT_REAL = get_int_t<T_REAL>; // either KMER_INT or <KMER_INT, count>
+ using KMER_REAL = get_first_type_t<T_REAL>; // 64/128/256-bit KmerBOSS on 2 bits
+ using T_INT_REAL = get_int_t<T_REAL>; // either KMER_REAL or <KMER_REAL, count>
- using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $
- using KMER_INT = typename KMER::WordType; // 64/128/256-bit integer
+ using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $ (on 3 bits)
+ using KMER_INT = typename KMER::WordType; // the 64/128/256-bit integer in KMER
uint32_t previous_phase = checkpoint->phase();
if (checkpoint->phase() == 0) {
@@ -742,11 +748,10 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
// file names for the dummy_sink and dummy_source_1..k_0..3 kmers
std::vector<std::string> dummy_chunk_names;
- const uint8_t alphabet_size = KmerExtractorBOSS::alphabet.size();
+ const uint8_t alphabet_size = KmerExtractor2Bit().alphabet.size();
for (size_t dummy_pref_len = 1; dummy_pref_len <= k; ++dummy_pref_len) {
for (TAlphabet i = 0; i < alphabet_size; ++i) {
- std::string suffix
- = std::to_string(dummy_pref_len + 1) + "_" + std::to_string(i);
+ std::string suffix = std::to_string(dummy_pref_len) + "_" + std::to_string(i);
dummy_chunk_names.push_back(dir/("dummy_source_" + suffix));
}
}
@@ -755,15 +760,16 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
if (checkpoint->phase() < 6) {
// generate dummy k-mers of prefix length 1..k
logger->trace("Starting generating dummy-1..{} source k-mers...", k);
- for (size_t dummy_pref_len = 2; dummy_pref_len <= k; ++dummy_pref_len) {
+ for (size_t dummy_pref_len = 1; dummy_pref_len < k; ++dummy_pref_len) {
std::vector<common::EliasFanoEncoderBuffered<KMER_INT>> next_chunks;
for (TAlphabet i = 0; i < alphabet_size; ++i) {
- next_chunks.emplace_back(dummy_chunk_names[dummy_pref_len * alphabet_size + i],
- ENCODER_BUFFER_SIZE);
+ next_chunks.emplace_back(
+ dummy_chunk_names[dummy_pref_len * alphabet_size + i],
+ ENCODER_BUFFER_SIZE);
}
- // the chunks containing (dummy_pref_len-1) dummy k-mers
+ // chunks containing dummy k-mers of prefix length dummy_pref_len
auto begin = dummy_chunk_names.begin() + (dummy_pref_len - 1) * alphabet_size;
std::vector<std::string> current_names(begin, begin + alphabet_size);
@@ -772,9 +778,10 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
const std::function<void(const KMER_INT &)> &write_dummy
= [&](const KMER_INT &v) {
KMER kmer(v);
+ assert(kmer[0]);
kmer.to_prev(k + 1, BOSS::kSentinelCode);
if (prev_kmer != kmer) {
- next_chunks[kmer[0]].add(kmer.data());
+ next_chunks[kmer[0] - 1].add(kmer.data());
prev_kmer = std::move(kmer);
}
num_kmers++;
@@ -784,7 +791,7 @@ void recover_dummy_nodes(const KmerCollector &kmer_collector,
std::for_each(next_chunks.begin(), next_chunks.end(),
[](auto &v) { v.finish(); });
logger->trace("Number of dummy k-mers with dummy prefix of length {}: {}",
- dummy_pref_len - 1, num_kmers);
+ dummy_pref_len, num_kmers);
}
checkpoint->set_phase(6);
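For intuition, each round above turns the sorted dummy-p source k-mers into dummy-(p+1) source k-mers by prepending another sentinel via to_prev, and the #prev_kmer comparison drops duplicates, which are adjacent because the input chunks are sorted. A self-contained sketch on plain strings (this to_prev is a hypothetical string analogue of KMER::to_prev, not the project's):

#include <iostream>
#include <string>
#include <vector>

// String analogue of KMER::to_prev(k + 1, kSentinelCode): the predecessor
// of a k-mer is obtained by dropping its last character and prepending '$'.
std::string to_prev(const std::string &s) {
    return "$" + s.substr(0, s.size() - 1);
}

int main() {
    // sorted dummy-1 source k-mers ('$' followed by k real characters)
    std::vector<std::string> dummy1 = { "$ACG", "$ACT", "$CCG" };
    std::vector<std::string> dummy2;
    std::string prev;
    for (const std::string &kmer : dummy1) {
        std::string next = to_prev(kmer);  // e.g. "$ACG" -> "$$AC"
        if (next != prev) {  // sorted input => duplicates are consecutive
            dummy2.push_back(next);
            prev = next;
        }
    }
    for (const std::string &s : dummy2)
        std::cout << s << '\n';  // prints $$AC and $$CC
}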
From 7e981bad14433d9c8fc2cb6b8d8695fa9e46bb36 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Sun, 6 Sep 2020 21:29:45 +0200
Subject: [PATCH 04/51] Clear checkpoint
---
metagraph/src/cli/build.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 448ed61f8d..4d5011f9f0 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -107,6 +107,13 @@ int build_graph(Config *config) {
}
boss::BuildCheckpoint checkpoint(config->tmp_dir);
+ if (checkpoint.phase() > 0 && suffixes.size() > 1) {
+ logger->error(
+ "Checkpointing for multiple chunks not supported. "
+ "Remove {} or continue building chunk by chunk",
+ checkpoint.checkpoint_file());
+ std::exit(1);
+ }
auto constructor = boss::IBOSSChunkConstructor::initialize(
boss_graph->get_k(),
@@ -148,6 +155,7 @@ int build_graph(Config *config) {
} else {
graph_data.reset(next_chunk);
}
+ checkpoint.done();
}
assert(graph_data);
From b9fbc53281b4637817fc46506c87b06b054f3ff0 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Mon, 7 Sep 2020 09:36:59 +0200
Subject: [PATCH 05/51] Small
---
.../src/graph/representation/succinct/boss_chunk_construct.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index 5a6f7ca1e4..54b03aa369 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -837,7 +837,7 @@ template
return reinterpret_cast(v);
}
},
- dummy_chunk_names, false /* remove sources */
+ dummy_chunk_names, false /* remove sources */
);
while (!decoder.empty() && !decoder_dummy.empty()) {
From d83368ddb72ae6771f997294a989bce3f5682ded Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Mon, 7 Sep 2020 12:03:42 +0200
Subject: [PATCH 06/51] Add --phase
---
metagraph/src/cli/build.cpp | 10 +--------
metagraph/src/cli/config/config.cpp | 21 +++++++++++++++++++
metagraph/src/cli/config/config.hpp | 2 ++
.../succinct/boss_chunk_construct.hpp | 2 +-
4 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 4d5011f9f0..5b47e297fc 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -106,15 +106,7 @@ int build_graph(Config *config) {
logger->info("k-mer suffix: '{}'", suffix);
}
- boss::BuildCheckpoint checkpoint(config->tmp_dir);
- if (checkpoint.phase() > 0 && suffixes.size() > 1) {
- logger->error(
- "Checkpointing for multiple chunks not supported. "
- "Remove {} or continue building chunk by chunk",
- checkpoint.checkpoint_file());
- std::exit(1);
- }
-
+ boss::BuildCheckpoint checkpoint(config->checkpoint, config->tmp_dir);
auto constructor = boss::IBOSSChunkConstructor::initialize(
boss_graph->get_k(),
config->canonical,
diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp
index b637704a55..6fe6d9bc84 100644
--- a/metagraph/src/cli/config/config.cpp
+++ b/metagraph/src/cli/config/config.cpp
@@ -334,6 +334,8 @@ Config::Config(int argc, char *argv[]) {
tmp_dir = get_value(i++);
} else if (!strcmp(argv[i], "--disk-cap-gb")) {
disk_cap_bytes = atoi(get_value(i++)) * 1e9;
+ } else if (!strcmp(argv[i], "--checkpoint")) {
+ checkpoint = true;
} else if (argv[i][0] == '-') {
fprintf(stderr, "\nERROR: Unknown option %s\n\n", argv[i]);
print_usage(argv[0], identity);
@@ -521,6 +523,24 @@ Config::Config(int argc, char *argv[]) {
if (identity == COMPARE && fnames.size() != 2)
print_usage_and_exit = true;
+ if (identity != BUILD && checkpoint) {
+ std::cerr << "Error: Checkpointing is only supported for disk-based building. "
+ "Remove --checkpoint.";
+ print_usage_and_exit = true;
+ }
+
+ if (checkpoint && tmp_dir.empty()) {
+ std::cerr << "Error: Checkpointing is only supported for disk-based building. "
+ "Please set --disk-swap.";
+ print_usage_and_exit = true;
+ }
+
+ if (checkpoint && suffix_len > 0) {
+ std::cerr << "Error: Checkpointing not supported for multiple suffixes. "
+ "Remove --checkpoint or specify each suffix separately using --suffix";
+ print_usage_and_exit = true;
+ }
+
if (discovery_fraction < 0 || discovery_fraction > 1)
print_usage_and_exit = true;
@@ -751,6 +771,7 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) {
fprintf(stderr, "\t-p --parallel [INT] \tuse multiple threads for computation [1]\n");
fprintf(stderr, "\t --disk-swap [STR] \tdirectory to use for temporary files [off]\n");
fprintf(stderr, "\t --disk-cap-gb [INT] \tmax temp disk space to use before forcing a merge, in GB [20]\n");
+ fprintf(stderr, "\t --checkpoint \t whether to save intermediate state in --disk-swap in order to resume an interrupted computation [off]\n");
} break;
case CLEAN: {
fprintf(stderr, "Usage: %s clean -o [options] GRAPH\n\n", prog_name.c_str());
diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp
index e9c435fe7e..9b2fb3764d 100644
--- a/metagraph/src/cli/config/config.hpp
+++ b/metagraph/src/cli/config/config.hpp
@@ -142,6 +142,8 @@ class Config {
size_t disk_cap_bytes = 20e9; // 20GB default
+ bool checkpoint = false;
+
enum IdentityType {
NO_IDENTITY = -1,
BUILD = 1,
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
index 0c22678a2a..8950b3be88 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.hpp
@@ -30,7 +30,7 @@ class IBOSSChunkConstructor : public IGraphChunkConstructor {
mtg::kmer::ContainerType container_type = mtg::kmer::ContainerType::VECTOR,
const std::filesystem::path &swap_dir = "/tmp/",
size_t max_disk_space_bytes = 1e9,
- const BuildCheckpoint& checkpoint = BuildCheckpoint("/tmp"));
+ const BuildCheckpoint& checkpoint = BuildCheckpoint(false, ""));
virtual uint64_t get_k() const = 0;
From 0b27107079c94941a3efb7cd7957047e0a55a7d6 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Tue, 8 Sep 2020 15:58:17 +0200
Subject: [PATCH 07/51] Working checkpointing
---
metagraph/src/cli/build.cpp | 12 ++-
metagraph/src/cli/config/config.cpp | 21 +++---
metagraph/src/cli/config/config.hpp | 2 +-
metagraph/src/common/utils/file_utils.cpp | 7 +-
metagraph/src/common/utils/file_utils.hpp | 3 +-
.../succinct/boss_chunk_construct.cpp | 74 ++++++++++---------
.../succinct/boss_chunk_construct.hpp | 2 +-
.../succinct/build_checkpoint.cpp | 54 ++++++++++++++
.../succinct/build_checkpoint.hpp | 40 ++++++++++
metagraph/src/kmer/kmer_collector.cpp | 3 +-
10 files changed, 165 insertions(+), 53 deletions(-)
create mode 100644 metagraph/src/graph/representation/succinct/build_checkpoint.cpp
create mode 100644 metagraph/src/graph/representation/succinct/build_checkpoint.hpp
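The new build_checkpoint.{cpp,hpp} are not included in this excerpt. Judging only from the call sites in this series (checkpoint()/set_checkpoint(), kmer_dir()/set_kmer_dir(), store(), done()), a minimal sketch could look as follows; the two-argument constructor, member layout, and on-disk format are assumptions, not the actual implementation:

#include <cstdint>
#include <filesystem>
#include <fstream>
#include <string>

// Hypothetical sketch of boss::BuildCheckpoint, inferred from its call
// sites in this patch series; the real class and file format may differ.
class BuildCheckpoint {
  public:
    BuildCheckpoint(bool enabled, const std::filesystem::path &dir)
          : enabled_(enabled), file_(dir / "checkpoint.txt") {
        std::ifstream in(file_);
        std::string kmer_dir;
        if (enabled_ && in >> checkpoint_ >> kmer_dir)
            kmer_dir_ = kmer_dir;  // resume from the stored state
    }
    uint32_t checkpoint() const { return checkpoint_; }
    void set_checkpoint(uint32_t c) { checkpoint_ = c; }
    const std::filesystem::path& kmer_dir() const { return kmer_dir_; }
    void set_kmer_dir(std::filesystem::path dir) { kmer_dir_ = std::move(dir); }
    // persist the progress after each completed stage
    void store() const {
        if (!enabled_)
            return;
        std::ofstream out(file_);
        out << checkpoint_ << ' ' << kmer_dir_.string() << '\n';
    }
    // the build finished; drop the checkpoint so the next run starts fresh
    void done() const { std::filesystem::remove(file_); }
  private:
    bool enabled_;
    std::filesystem::path file_;
    uint32_t checkpoint_ = 0;
    std::filesystem::path kmer_dir_;
};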
diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp
index 5b47e297fc..de6ce38b8f 100644
--- a/metagraph/src/cli/build.cpp
+++ b/metagraph/src/cli/build.cpp
@@ -106,7 +106,9 @@ int build_graph(Config *config) {
logger->info("k-mer suffix: '{}'", suffix);
}
- boss::BuildCheckpoint checkpoint(config->checkpoint, config->tmp_dir);
+ bool checkpoint_enabled = !config->tmp_dir.empty() && suffixes.size() == 1;
+ boss::BuildCheckpoint checkpoint(checkpoint_enabled, config->outfbase,
+ config->phase);
auto constructor = boss::IBOSSChunkConstructor::initialize(
boss_graph->get_k(),
config->canonical,
@@ -121,16 +123,20 @@ int build_graph(Config *config) {
checkpoint
);
- if (checkpoint.phase() == 0) {
+ if (checkpoint.checkpoint() == 0) {
push_sequences(files, *config, timer, constructor.get());
} else {
logger->info("Skipping parsing sequences from input file(s)");
}
boss::BOSS::Chunk *next_chunk = constructor->build_chunk();
+
+ if (checkpoint.phase() < 2) { // phase 1 stops after generating dummy k-mers
+ assert(next_chunk == nullptr);
+ return 0;
+ }
logger->trace("Graph chunk with {} k-mers was built in {} sec",
next_chunk->size() - 1, timer.elapsed());
-
if (config->suffix.size()) {
logger->info("Serialize the graph chunk for suffix '{}'...", suffix);
timer.reset();
diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp
index 6fe6d9bc84..6206720f04 100644
--- a/metagraph/src/cli/config/config.cpp
+++ b/metagraph/src/cli/config/config.cpp
@@ -334,8 +334,8 @@ Config::Config(int argc, char *argv[]) {
tmp_dir = get_value(i++);
} else if (!strcmp(argv[i], "--disk-cap-gb")) {
disk_cap_bytes = atoi(get_value(i++)) * 1e9;
- } else if (!strcmp(argv[i], "--checkpoint")) {
- checkpoint = true;
+ } else if (!strcmp(argv[i], "--phase")) {
+ phase = atoi(get_value(i++));
} else if (argv[i][0] == '-') {
fprintf(stderr, "\nERROR: Unknown option %s\n\n", argv[i]);
print_usage(argv[0], identity);
@@ -523,21 +523,20 @@ Config::Config(int argc, char *argv[]) {
if (identity == COMPARE && fnames.size() != 2)
print_usage_and_exit = true;
- if (identity != BUILD && checkpoint) {
- std::cerr << "Error: Checkpointing is only supported for disk-based building. "
- "Remove --checkpoint.";
+ if (identity != BUILD && phase != 2) {
+ std::cerr << "Error: Phases are only supported for building. Remove --phase.";
print_usage_and_exit = true;
}
- if (checkpoint && tmp_dir.empty()) {
- std::cerr << "Error: Checkpointing is only supported for disk-based building. "
+ if (phase != 2 && tmp_dir.empty()) {
+ std::cerr << "Error: Phases are only supported for disk-based building. "
"Please set --disk-swap.";
print_usage_and_exit = true;
}
- if (checkpoint && suffix_len > 0) {
- std::cerr << "Error: Checkpointing not supported for multiple suffixes. "
- "Remove --checkpoint or specify each suffix separately using --suffix";
+ if (phase != 2 && suffix_len > 0) {
+ std::cerr << "Error: Phases are not supported for multiple suffixes. "
+ "Remove --phase or specify each suffix separately using --suffix";
print_usage_and_exit = true;
}
@@ -771,7 +770,7 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) {
fprintf(stderr, "\t-p --parallel [INT] \tuse multiple threads for computation [1]\n");
fprintf(stderr, "\t --disk-swap [STR] \tdirectory to use for temporary files [off]\n");
fprintf(stderr, "\t --disk-cap-gb [INT] \tmax temp disk space to use before forcing a merge, in GB [20]\n");
- fprintf(stderr, "\t --checkpoint \t whether to save intermediate state in --disk-swap in order to resume an interrupted computation [off]\n");
+ fprintf(stderr, "\t --phase [INT] \tmax where to stop the computation (1=generate kmers, 2= build all) [2]\n");
} break;
case CLEAN: {
fprintf(stderr, "Usage: %s clean -o [options] GRAPH\n\n", prog_name.c_str());
diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp
index 9b2fb3764d..c3b97c38c3 100644
--- a/metagraph/src/cli/config/config.hpp
+++ b/metagraph/src/cli/config/config.hpp
@@ -142,7 +142,7 @@ class Config {
size_t disk_cap_bytes = 20e9; // 20GB default
- bool checkpoint = false;
+ uint32_t phase = 2;
enum IdentityType {
NO_IDENTITY = -1,
diff --git a/metagraph/src/common/utils/file_utils.cpp b/metagraph/src/common/utils/file_utils.cpp
index 9d0e7f646c..91f964999c 100644
--- a/metagraph/src/common/utils/file_utils.cpp
+++ b/metagraph/src/common/utils/file_utils.cpp
@@ -38,7 +38,8 @@ void cleanup_tmp_dir_on_exit() {
}
std::filesystem::path create_temp_dir(std::filesystem::path path,
- const std::string &name) {
+ const std::string &name,
+ bool clean_on_exit) {
if (path.empty())
path = "./";
@@ -48,6 +49,10 @@ std::filesystem::path create_temp_dir(std::filesystem::path path,
exit(1);
}
+ if (!clean_on_exit) {
+ return tmp_dir_str;
+ }
+
if (TMP_DIRS.empty()) {
if (std::signal(SIGINT, cleanup_tmp_dir_on_signal) == SIG_ERR)
logger->error("Couldn't reset the signal handler for SIGINT");
diff --git a/metagraph/src/common/utils/file_utils.hpp b/metagraph/src/common/utils/file_utils.hpp
index 1975acc136..8430f743c8 100644
--- a/metagraph/src/common/utils/file_utils.hpp
+++ b/metagraph/src/common/utils/file_utils.hpp
@@ -14,7 +14,8 @@
namespace utils {
std::filesystem::path create_temp_dir(std::filesystem::path path,
- const std::string &name = "");
+ const std::string &name = "",
+ bool clean_on_exit = true);
bool check_if_writable(const std::string &filename);
diff --git a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
index 54b03aa369..6ef36a2664 100644
--- a/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
+++ b/metagraph/src/graph/representation/succinct/boss_chunk_construct.cpp
@@ -362,7 +362,7 @@ std::vector<std::string> split(size_t k,
names[i] = dir/("real_F_W_" + std::to_string(i));
}
- if (checkpoint->phase() > 2) {
+ if (checkpoint->checkpoint() > 2) {
logger->info("Skipping splitting k-mers into chunks");
return names;
}
@@ -384,7 +384,7 @@ std::vector split(size_t k,
std::for_each(sinks.begin(), sinks.end(), [](auto &f) { f.finish(); });
logger->trace("Total number of real k-mers: {}", num_kmers);
- checkpoint->set_phase(3);
+ checkpoint->set_checkpoint(3);
checkpoint->store();
return names;
@@ -414,7 +414,7 @@ concatenate_chunks(const std::filesystem::path &dir,
real_split_by_W[W] = dir/("real_split_by_W_" + std::to_string(W));
}
- if (checkpoint->phase() > 4) {
+ if (checkpoint->checkpoint() > 4) {
return { real_split_by_W, dummy_sink_name };
}
@@ -443,7 +443,7 @@ concatenate_chunks(const std::filesystem::path &dir,
std::filesystem::remove(name);
}
- checkpoint->set_phase(5);
+ checkpoint->set_checkpoint(5);
checkpoint->store();
return { real_split_by_W, dummy_sink_name };
}
@@ -480,7 +480,7 @@ generate_dummy_1_kmers(size_t k,
dummy_sink_names[i] = dir/("dummy_sink_" + std::to_string(i));
}
- if (checkpoint->phase() > 3) {
+ if (checkpoint->checkpoint() > 3) {
logger->info("Skipping generating dummy-1 source k-mers and dummy sink kmers");
return { dummy_sink_names, real_F_W };
}
@@ -559,7 +559,7 @@ generate_dummy_1_kmers(size_t k,
logger->trace("Generated {} dummy sink and {} dummy source k-mers", num_sink,
num_source);
- checkpoint->set_phase(4);
+ checkpoint->set_checkpoint(4);
checkpoint->store();
return { dummy_sink_names, real_F_W };
@@ -576,7 +576,7 @@ void add_reverse_complements(size_t k,
ThreadPool& async_worker,
ChunkedWaitQueue *kmers,
BuildCheckpoint *checkpoint) {
- if (checkpoint->phase() > 2) {
+ if (checkpoint->checkpoint() > 2) {
logger->info("Skipping generating reverse complements");
return;
}
@@ -584,7 +584,7 @@ void add_reverse_complements(size_t k,
std::unique_ptr<common::SortedSetDisk<T_INT_REAL>> rc_set;
std::vector<std::string> to_merge = { dir/"original" };
- if (checkpoint->phase() == 2) {
+ if (checkpoint->checkpoint() == 2) {
logger->info(
"Continuing from checkpoint phase 2. Looking for 'original' and "
"'rc/chunk_*' in {}",
@@ -611,7 +611,7 @@ void add_reverse_complements(size_t k,
checkpoint->kmer_dir());
std::exit(1);
}
- } else { // checkpoint->phase() < 2
+ } else { // checkpoint->checkpoint() < 2
std::string rc_dir = dir/"rc";
std::filesystem::create_directory(rc_dir);
rc_set = std::make_unique<common::SortedSetDisk<T_INT_REAL>>(
@@ -650,7 +650,7 @@ void add_reverse_complements(size_t k,
to_merge.insert(to_merge.end(), to_insert.begin(), to_insert.end());
rc_set->clear(dir, false /* don't delete chunk files! */);
original.finish();
- checkpoint->set_phase(2);
+ checkpoint->set_checkpoint(2);
checkpoint->store();
}
@@ -689,10 +689,10 @@ template
using KMER = get_first_type_t<T>; // 64/128/256-bit KmerBOSS with sentinel $ (on 3 bits)
using KMER_INT = typename KMER::WordType; // the 64/128/256-bit integer in KMER
- uint32_t previous_phase = checkpoint->phase();
- if (checkpoint->phase() == 0) {
+ uint32_t previous_phase = checkpoint->checkpoint();
+ if (checkpoint->checkpoint() == 0) {
checkpoint->set_kmer_dir(kmer_collector.tmp_dir());
- checkpoint->set_phase(1);
+ checkpoint->set_checkpoint(1);
checkpoint->store();
}
@@ -757,7 +757,7 @@ template
}
dummy_chunk_names.push_back(dummy_sink_name);
- if (checkpoint->phase() < 6) {
+ if (checkpoint->checkpoint() < 6) {
// generate dummy k-mers of prefix length 1..k
logger->trace("Starting generating dummy-1..{} source k-mers...", k);
for (size_t dummy_pref_len = 1; dummy_pref_len < k; ++dummy_pref_len) {
@@ -794,7 +794,7 @@ template
dummy_pref_len, num_kmers);
}
- checkpoint->set_phase(6);
+ checkpoint->set_checkpoint(6);
checkpoint->store();
} else {
logger->info("Skipping generating dummy-1..{} source k-mers", k);
@@ -903,7 +903,7 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
tmp_dir,
max_disk_space,
both_strands_mode && filter_suffix.empty() /* keep only canonical k-mers */),
- bits_per_count_(bits_per_count), checkpoint_(checkpoint) {
+ bits_per_count_(bits_per_count), checkpoint_(checkpoint), tmp_dir_(tmp_dir) {
if (filter_suffix.size()
&& filter_suffix == std::string(filter_suffix.size(), BOSS::kSentinel)) {
kmer_collector_.add_kmer(std::vector<TAlphabet>(k + 1, BOSS::kSentinelCode));
@@ -926,6 +926,25 @@ class BOSSChunkConstructor : public IBOSSChunkConstructor {
return kmer_collector_.tmp_dir();
}
+ template <typename Container>
+ BOSS::Chunk *build_chunk_2bit(Container &kmers) {
+ logger->trace("Reconstructing all required dummy source k-mers...");
+
+ Timer timer;
+ ChunkedWaitQueue