From 957efdcc07c4b8c0889787109dbcdc1bb5f78d93 Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 25 May 2022 22:50:30 +0200 Subject: [PATCH] using tmp_dir --- README.md | 37 ++++++++++++++++++--------------- include/builder/build.cpp | 4 ++-- include/builder/build_index.cpp | 8 +++---- include/builder/parse_file.cpp | 4 ++-- include/builder/util.hpp | 2 +- include/minimizers.hpp | 3 ++- include/util.hpp | 6 +++++- src/build.cpp | 12 +++++++++-- 8 files changed, 46 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index e66a087..7daab62 100644 --- a/README.md +++ b/README.md @@ -100,49 +100,52 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the to show the usage of the driver program (reported below for convenience). - Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [--check] [--bench] [--verbose] - + Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [-d tmp_dirname] [--check] [--bench] [--verbose] + input_filename Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not: - without duplicate nor invalid kmers - one DNA sequence per line. For example, it could be the de Bruijn graph topology output by BCALM. - + k K-mer length (must be <= 31). - + m Minimizer length (must be < k). - + [-s seed] Seed for construction (default is 1). - + [-l l] A (integer) constant that controls the space/time trade-off of the dictionary. A reasonable values lies between 2 and 12 (default is 6). - + [-c c] A (floating point) constant that trades construction speed for space effectiveness of minimal perfect hashing. A reasonable value lies between 3.0 and 10.0 (default is 3.000000). - + + [-o output_filename] + Output file name where the data structure will be serialized. + + [-d tmp_dirname] + Temporary directory used for construction in external memory. Default is directory '.'. + [--canonical-parsing] Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time. - + [--weighted] Also store the weights in compressed format. - - [-o output_filename] - Output file name where the data structure will be serialized. - + [--check] Check correctness after construction. - + [--bench] Run benchmark after construction. - + [--verbose] Verbose output during construction. - + [-h,--help] - Print this help text and silently exits. + Print this help text and silently exits. Examples diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 999a4d8..36ef252 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -75,7 +75,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b mm::file_source input(data.minimizers.get_minimizers_filename(), mm::advice::sequential); minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); - m_minimizers.build(iterator, data.minimizers.num_minimizers()); + m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config); input.close(); } timer.stop(); @@ -86,7 +86,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b /* step 3: build index ***/ timer.start(); - auto buckets_stats = build_index(data, m_minimizers, m_buckets); + auto buckets_stats = build_index(data, m_minimizers, m_buckets, build_config); timer.stop(); timings.push_back(timer.elapsed()); print_time(timings.back(), data.num_kmers, "step 3: 'build_index'"); diff --git a/include/builder/build_index.cpp b/include/builder/build_index.cpp index 58d8a44..b44ef3f 100644 --- a/include/builder/build_index.cpp +++ b/include/builder/build_index.cpp @@ -36,7 +36,7 @@ struct bucket_pairs_iterator : std::forward_iterator_tag { struct bucket_pairs { static constexpr uint64_t ram_limit = 0.25 * essentials::GB; - bucket_pairs(std::string tmp_dirname = constants::default_tmp_dirname) + bucket_pairs(std::string const& tmp_dirname) : m_buffer_size(0) , m_num_files_to_merge(0) , m_run_identifier(pthash::clock_type::now().time_since_epoch().count()) @@ -173,8 +173,8 @@ struct bucket_pairs { } }; -buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, - buckets& m_buckets) { +buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets, + build_configuration const& build_config) { uint64_t num_buckets = m_minimizers.size(); uint64_t num_kmers = data.num_kmers; uint64_t num_super_kmers = data.strings.num_super_kmers(); @@ -188,7 +188,7 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, mm::file_source input(data.minimizers.get_minimizers_filename(), mm::advice::sequential); - bucket_pairs bucket_pairs_manager; + bucket_pairs bucket_pairs_manager(build_config.tmp_dirname); uint64_t num_singletons = 0; for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); it.has_next(); it.next()) { diff --git a/include/builder/parse_file.cpp b/include/builder/parse_file.cpp index db1dfe6..01f940a 100644 --- a/include/builder/parse_file.cpp +++ b/include/builder/parse_file.cpp @@ -3,7 +3,7 @@ namespace sshash { struct parse_data { - parse_data() : num_kmers(0) {} + parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {} uint64_t num_kmers; minimizers_tuples minimizers; compact_string_pool strings; @@ -205,7 +205,7 @@ parse_data parse_file(std::string const& filename, build_configuration const& bu std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); std::cout << "reading file '" << filename << "'..." << std::endl; - parse_data data; + parse_data data(build_config.tmp_dirname); if (util::ends_with(filename, ".gz")) { zip_istream zis(is); parse_file(zis, data, build_config); diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 9a328f5..442bf14 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -160,7 +160,7 @@ struct minimizers_tuples_iterator : std::forward_iterator_tag { struct minimizers_tuples { static constexpr uint64_t ram_limit = 0.5 * essentials::GB; - minimizers_tuples(std::string tmp_dirname = constants::default_tmp_dirname) + minimizers_tuples(std::string const& tmp_dirname) : m_buffer_size(0) , m_num_files_to_merge(0) , m_num_minimizers(0) diff --git a/include/minimizers.hpp b/include/minimizers.hpp index ebbdb54..b228415 100644 --- a/include/minimizers.hpp +++ b/include/minimizers.hpp @@ -6,7 +6,7 @@ namespace sshash { struct minimizers { template - void build(ForwardIterator begin, uint64_t size) { + void build(ForwardIterator begin, uint64_t size, build_configuration const& build_config) { util::check_hash_collision_probability(size); pthash::build_configuration mphf_config; mphf_config.c = 6.0; @@ -16,6 +16,7 @@ struct minimizers { mphf_config.verbose_output = false; mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1; mphf_config.ram = 2 * essentials::GB; + mphf_config.tmp_dir = build_config.tmp_dirname; m_mphf.build_in_external_memory(begin, size, mphf_config); } diff --git a/include/util.hpp b/include/util.hpp index caa5329..fb3a53b 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -67,7 +67,9 @@ struct build_configuration { , canonical_parsing(false) , weighted(false) - , verbose(true) {} + , verbose(true) + + , tmp_dirname(constants::default_tmp_dirname) {} uint64_t k; // kmer size uint64_t m; // minimizer size @@ -80,6 +82,8 @@ struct build_configuration { bool weighted; bool verbose; + std::string tmp_dirname; + void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c diff --git a/src/build.cpp b/src/build.cpp index 67f258c..15982b8 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -37,13 +37,18 @@ int main(int argc, char** argv) { "A reasonable value lies between 3.0 and 10.0 (default is " + std::to_string(constants::c) + ").", "-c", false); + parser.add("output_filename", "Output file name where the data structure will be serialized.", + "-o", false); + parser.add( + "tmp_dirname", + "Temporary directory used for construction in external memory. Default is directory '" + + constants::default_tmp_dirname + "'.", + "-d", false); parser.add("canonical_parsing", "Canonical parsing of k-mers. This option changes the parsing and results in a " "trade-off between index space and lookup time.", "--canonical-parsing", true); parser.add("weighted", "Also store the weights in compressed format.", "--weighted", true); - parser.add("output_filename", "Output file name where the data structure will be serialized.", - "-o", false); parser.add("check", "Check correctness after construction.", "--check", true); parser.add("bench", "Run benchmark after construction.", "--bench", true); parser.add("verbose", "Verbose output during construction.", "--verbose", true); @@ -66,6 +71,9 @@ int main(int argc, char** argv) { build_config.canonical_parsing = parser.get("canonical_parsing"); build_config.weighted = parser.get("weighted"); build_config.verbose = parser.get("verbose"); + if (parser.parsed("tmp_dirname")) { + build_config.tmp_dirname = parser.get("tmp_dirname"); + } build_config.print(); dict.build(input_filename, build_config);