Skip to content

Commit

Permalink
using tmp_dir
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed May 25, 2022
1 parent a5f26c1 commit 957efdc
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 30 deletions.
37 changes: 20 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,49 +100,52 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the

to show the usage of the driver program (reported below for convenience).

Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [--check] [--bench] [--verbose]

Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [-d tmp_dirname] [--check] [--bench] [--verbose]
input_filename
Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:
- without duplicate or invalid k-mers
- one DNA sequence per line.
For example, it could be the de Bruijn graph topology output by BCALM.

k
K-mer length (must be <= 31).

m
Minimizer length (must be < k).

[-s seed]
Seed for construction (default is 1).

[-l l]
A (integer) constant that controls the space/time trade-off of the dictionary. A reasonable value lies between 2 and 12 (default is 6).

[-c c]
A (floating point) constant that trades construction speed for space effectiveness of minimal perfect hashing. A reasonable value lies between 3.0 and 10.0 (default is 3.000000).


[-o output_filename]
Output file name where the data structure will be serialized.

[-d tmp_dirname]
Temporary directory used for construction in external memory. Default is directory '.'.
[--canonical-parsing]
Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time.

[--weighted]
Also store the weights in compressed format.

[-o output_filename]
Output file name where the data structure will be serialized.


[--check]
Check correctness after construction.

[--bench]
Run benchmark after construction.

[--verbose]
Verbose output during construction.

[-h,--help]
Print this help text and silently exit.
Print this help text and silently exit.

Examples
Expand Down
4 changes: 2 additions & 2 deletions include/builder/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
mm::advice::sequential);
minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());
m_minimizers.build(iterator, data.minimizers.num_minimizers());
m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config);
input.close();
}
timer.stop();
Expand All @@ -86,7 +86,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b

/* step 3: build index ***/
timer.start();
auto buckets_stats = build_index(data, m_minimizers, m_buckets);
auto buckets_stats = build_index(data, m_minimizers, m_buckets, build_config);
timer.stop();
timings.push_back(timer.elapsed());
print_time(timings.back(), data.num_kmers, "step 3: 'build_index'");
Expand Down
8 changes: 4 additions & 4 deletions include/builder/build_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct bucket_pairs_iterator : std::forward_iterator_tag {
struct bucket_pairs {
static constexpr uint64_t ram_limit = 0.25 * essentials::GB;

bucket_pairs(std::string tmp_dirname = constants::default_tmp_dirname)
bucket_pairs(std::string const& tmp_dirname)
: m_buffer_size(0)
, m_num_files_to_merge(0)
, m_run_identifier(pthash::clock_type::now().time_since_epoch().count())
Expand Down Expand Up @@ -173,8 +173,8 @@ struct bucket_pairs {
}
};

buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers,
buckets& m_buckets) {
buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets,
build_configuration const& build_config) {
uint64_t num_buckets = m_minimizers.size();
uint64_t num_kmers = data.num_kmers;
uint64_t num_super_kmers = data.strings.num_super_kmers();
Expand All @@ -188,7 +188,7 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers,
mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
mm::advice::sequential);

bucket_pairs bucket_pairs_manager;
bucket_pairs bucket_pairs_manager(build_config.tmp_dirname);
uint64_t num_singletons = 0;
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); it.has_next();
it.next()) {
Expand Down
4 changes: 2 additions & 2 deletions include/builder/parse_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace sshash {

struct parse_data {
parse_data() : num_kmers(0) {}
parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {}
uint64_t num_kmers;
minimizers_tuples minimizers;
compact_string_pool strings;
Expand Down Expand Up @@ -205,7 +205,7 @@ parse_data parse_file(std::string const& filename, build_configuration const& bu
std::ifstream is(filename.c_str());
if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
std::cout << "reading file '" << filename << "'..." << std::endl;
parse_data data;
parse_data data(build_config.tmp_dirname);
if (util::ends_with(filename, ".gz")) {
zip_istream zis(is);
parse_file(zis, data, build_config);
Expand Down
2 changes: 1 addition & 1 deletion include/builder/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ struct minimizers_tuples_iterator : std::forward_iterator_tag {
struct minimizers_tuples {
static constexpr uint64_t ram_limit = 0.5 * essentials::GB;

minimizers_tuples(std::string tmp_dirname = constants::default_tmp_dirname)
minimizers_tuples(std::string const& tmp_dirname)
: m_buffer_size(0)
, m_num_files_to_merge(0)
, m_num_minimizers(0)
Expand Down
3 changes: 2 additions & 1 deletion include/minimizers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace sshash {

struct minimizers {
template <typename ForwardIterator>
void build(ForwardIterator begin, uint64_t size) {
void build(ForwardIterator begin, uint64_t size, build_configuration const& build_config) {
util::check_hash_collision_probability(size);
pthash::build_configuration mphf_config;
mphf_config.c = 6.0;
Expand All @@ -16,6 +16,7 @@ struct minimizers {
mphf_config.verbose_output = false;
mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1;
mphf_config.ram = 2 * essentials::GB;
mphf_config.tmp_dir = build_config.tmp_dirname;
m_mphf.build_in_external_memory(begin, size, mphf_config);
}

Expand Down
6 changes: 5 additions & 1 deletion include/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ struct build_configuration {

, canonical_parsing(false)
, weighted(false)
, verbose(true) {}
, verbose(true)

, tmp_dirname(constants::default_tmp_dirname) {}

uint64_t k; // kmer size
uint64_t m; // minimizer size
Expand All @@ -80,6 +82,8 @@ struct build_configuration {
bool weighted;
bool verbose;

std::string tmp_dirname;

void print() const {
std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l
<< ", c = " << c
Expand Down
12 changes: 10 additions & 2 deletions src/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,18 @@ int main(int argc, char** argv) {
"A reasonable value lies between 3.0 and 10.0 (default is " +
std::to_string(constants::c) + ").",
"-c", false);
parser.add("output_filename", "Output file name where the data structure will be serialized.",
"-o", false);
parser.add(
"tmp_dirname",
"Temporary directory used for construction in external memory. Default is directory '" +
constants::default_tmp_dirname + "'.",
"-d", false);
parser.add("canonical_parsing",
"Canonical parsing of k-mers. This option changes the parsing and results in a "
"trade-off between index space and lookup time.",
"--canonical-parsing", true);
parser.add("weighted", "Also store the weights in compressed format.", "--weighted", true);
parser.add("output_filename", "Output file name where the data structure will be serialized.",
"-o", false);
parser.add("check", "Check correctness after construction.", "--check", true);
parser.add("bench", "Run benchmark after construction.", "--bench", true);
parser.add("verbose", "Verbose output during construction.", "--verbose", true);
Expand All @@ -66,6 +71,9 @@ int main(int argc, char** argv) {
build_config.canonical_parsing = parser.get<bool>("canonical_parsing");
build_config.weighted = parser.get<bool>("weighted");
build_config.verbose = parser.get<bool>("verbose");
if (parser.parsed("tmp_dirname")) {
build_config.tmp_dirname = parser.get<std::string>("tmp_dirname");
}
build_config.print();

dict.build(input_filename, build_config);
Expand Down

0 comments on commit 957efdc

Please sign in to comment.