From 957efdcc07c4b8c0889787109dbcdc1bb5f78d93 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Wed, 25 May 2022 22:50:30 +0200
Subject: [PATCH] using tmp_dir

---
 README.md                       | 37 ++++++++++++++++++---------------
 include/builder/build.cpp       |  4 ++--
 include/builder/build_index.cpp |  8 +++----
 include/builder/parse_file.cpp  |  4 ++--
 include/builder/util.hpp        |  2 +-
 include/minimizers.hpp          |  3 ++-
 include/util.hpp                |  6 +++++-
 src/build.cpp                   | 12 +++++++++--
 8 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index e66a087..7daab62 100644
--- a/README.md
+++ b/README.md
@@ -100,49 +100,52 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the
 
 to show the usage of the driver program (reported below for convenience).
 
-	Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [--check] [--bench] [--verbose]
-
+	Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [-d tmp_dirname] [--check] [--bench] [--verbose]
+	
 	 input_filename
 		Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:
 		- without duplicate nor invalid kmers
 		- one DNA sequence per line.
 		For example, it could be the de Bruijn graph topology output by BCALM.
-
+	
 	 k
 		K-mer length (must be <= 31).
-
+	
 	 m
 		Minimizer length (must be < k).
-
+	
 	 [-s seed]
 		Seed for construction (default is 1).
-
+	
 	 [-l l]
 		A (integer) constant that controls the space/time trade-off of the dictionary. A reasonable values lies between 2 and 12 (default is 6).
-
+	
 	 [-c c]
 		A (floating point) constant that trades construction speed for space effectiveness of minimal perfect hashing. A reasonable value lies between 3.0 and 10.0 (default is 3.000000).
-
+	
+	 [-o output_filename]
+		Output file name where the data structure will be serialized.
+	
+	 [-d tmp_dirname]
+		Temporary directory used for construction in external memory. Default is directory '.'.
+		
 	 [--canonical-parsing]
 		Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time.
-
+	
 	 [--weighted]
 		Also store the weights in compressed format.
-
-	 [-o output_filename]
-		Output file name where the data structure will be serialized.
-
+	
 	 [--check]
 		Check correctness after construction.
-
+	
 	 [--bench]
 		Run benchmark after construction.
-
+	
 	 [--verbose]
 		Verbose output during construction.
-
+	
 	 [-h,--help]
-		Print this help text and silently exits.
+		Print this help text and silently exits.
 		
 
 Examples
diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 999a4d8..36ef252 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -75,7 +75,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
         mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
                                                mm::advice::sequential);
         minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());
-        m_minimizers.build(iterator, data.minimizers.num_minimizers());
+        m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config);
         input.close();
     }
     timer.stop();
@@ -86,7 +86,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
 
     /* step 3: build index ***/
     timer.start();
-    auto buckets_stats = build_index(data, m_minimizers, m_buckets);
+    auto buckets_stats = build_index(data, m_minimizers, m_buckets, build_config);
     timer.stop();
     timings.push_back(timer.elapsed());
     print_time(timings.back(), data.num_kmers, "step 3: 'build_index'");
diff --git a/include/builder/build_index.cpp b/include/builder/build_index.cpp
index 58d8a44..b44ef3f 100644
--- a/include/builder/build_index.cpp
+++ b/include/builder/build_index.cpp
@@ -36,7 +36,7 @@ struct bucket_pairs_iterator : std::forward_iterator_tag {
 struct bucket_pairs {
     static constexpr uint64_t ram_limit = 0.25 * essentials::GB;
 
-    bucket_pairs(std::string tmp_dirname = constants::default_tmp_dirname)
+    bucket_pairs(std::string const& tmp_dirname)
         : m_buffer_size(0)
         , m_num_files_to_merge(0)
         , m_run_identifier(pthash::clock_type::now().time_since_epoch().count())
@@ -173,8 +173,8 @@ struct bucket_pairs {
     }
 };
 
-buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers,
-                               buckets& m_buckets) {
+buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets,
+                               build_configuration const& build_config) {
     uint64_t num_buckets = m_minimizers.size();
     uint64_t num_kmers = data.num_kmers;
     uint64_t num_super_kmers = data.strings.num_super_kmers();
@@ -188,7 +188,7 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers,
     mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
                                            mm::advice::sequential);
 
-    bucket_pairs bucket_pairs_manager;
+    bucket_pairs bucket_pairs_manager(build_config.tmp_dirname);
     uint64_t num_singletons = 0;
     for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); it.has_next();
          it.next()) {
diff --git a/include/builder/parse_file.cpp b/include/builder/parse_file.cpp
index db1dfe6..01f940a 100644
--- a/include/builder/parse_file.cpp
+++ b/include/builder/parse_file.cpp
@@ -3,7 +3,7 @@
 namespace sshash {
 
 struct parse_data {
-    parse_data() : num_kmers(0) {}
+    parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {}
     uint64_t num_kmers;
     minimizers_tuples minimizers;
     compact_string_pool strings;
@@ -205,7 +205,7 @@ parse_data parse_file(std::string const& filename, build_configuration const& bu
     std::ifstream is(filename.c_str());
     if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
     std::cout << "reading file '" << filename << "'..." << std::endl;
-    parse_data data;
+    parse_data data(build_config.tmp_dirname);
     if (util::ends_with(filename, ".gz")) {
         zip_istream zis(is);
         parse_file(zis, data, build_config);
diff --git a/include/builder/util.hpp b/include/builder/util.hpp
index 9a328f5..442bf14 100644
--- a/include/builder/util.hpp
+++ b/include/builder/util.hpp
@@ -160,7 +160,7 @@ struct minimizers_tuples_iterator : std::forward_iterator_tag {
 struct minimizers_tuples {
     static constexpr uint64_t ram_limit = 0.5 * essentials::GB;
 
-    minimizers_tuples(std::string tmp_dirname = constants::default_tmp_dirname)
+    minimizers_tuples(std::string const& tmp_dirname)
         : m_buffer_size(0)
         , m_num_files_to_merge(0)
         , m_num_minimizers(0)
diff --git a/include/minimizers.hpp b/include/minimizers.hpp
index ebbdb54..b228415 100644
--- a/include/minimizers.hpp
+++ b/include/minimizers.hpp
@@ -6,7 +6,7 @@ namespace sshash {
 
 struct minimizers {
     template <typename ForwardIterator>
-    void build(ForwardIterator begin, uint64_t size) {
+    void build(ForwardIterator begin, uint64_t size, build_configuration const& build_config) {
         util::check_hash_collision_probability(size);
         pthash::build_configuration mphf_config;
         mphf_config.c = 6.0;
@@ -16,6 +16,7 @@ struct minimizers {
         mphf_config.verbose_output = false;
         mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1;
         mphf_config.ram = 2 * essentials::GB;
+        mphf_config.tmp_dir = build_config.tmp_dirname;
         m_mphf.build_in_external_memory(begin, size, mphf_config);
     }
 
diff --git a/include/util.hpp b/include/util.hpp
index caa5329..fb3a53b 100644
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -67,7 +67,9 @@ struct build_configuration {
 
         , canonical_parsing(false)
         , weighted(false)
-        , verbose(true) {}
+        , verbose(true)
+
+        , tmp_dirname(constants::default_tmp_dirname) {}
 
     uint64_t k;  // kmer size
     uint64_t m;  // minimizer size
@@ -80,6 +82,8 @@ struct build_configuration {
     bool weighted;
     bool verbose;
 
+    std::string tmp_dirname;
+
     void print() const {
         std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l
                   << ", c = " << c
diff --git a/src/build.cpp b/src/build.cpp
index 67f258c..15982b8 100644
--- a/src/build.cpp
+++ b/src/build.cpp
@@ -37,13 +37,18 @@ int main(int argc, char** argv) {
                "A reasonable value lies between 3.0 and 10.0 (default is " +
                    std::to_string(constants::c) + ").",
                "-c", false);
+    parser.add("output_filename", "Output file name where the data structure will be serialized.",
+               "-o", false);
+    parser.add(
+        "tmp_dirname",
+        "Temporary directory used for construction in external memory. Default is directory '" +
+            constants::default_tmp_dirname + "'.",
+        "-d", false);
     parser.add("canonical_parsing",
                "Canonical parsing of k-mers. This option changes the parsing and results in a "
                "trade-off between index space and lookup time.",
                "--canonical-parsing", true);
     parser.add("weighted", "Also store the weights in compressed format.", "--weighted", true);
-    parser.add("output_filename", "Output file name where the data structure will be serialized.",
-               "-o", false);
     parser.add("check", "Check correctness after construction.", "--check", true);
     parser.add("bench", "Run benchmark after construction.", "--bench", true);
     parser.add("verbose", "Verbose output during construction.", "--verbose", true);
@@ -66,6 +71,9 @@ int main(int argc, char** argv) {
     build_config.canonical_parsing = parser.get<bool>("canonical_parsing");
     build_config.weighted = parser.get<bool>("weighted");
     build_config.verbose = parser.get<bool>("verbose");
+    if (parser.parsed("tmp_dirname")) {
+        build_config.tmp_dirname = parser.get<std::string>("tmp_dirname");
+    }
     build_config.print();
 
     dict.build(input_filename, build_config);