Skip to content

Commit

Permalink
general renaming to match description in papers
Browse files (browse the repository at this point in the history)
  • Loading branch information
jermp committed May 24, 2022
1 parent b218b63 commit c48948c
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 188 deletions.
24 changes: 12 additions & 12 deletions include/buckets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ struct buckets {
}

std::pair<uint64_t, uint64_t> locate_bucket(uint64_t bucket_id) const {
uint64_t begin = num_strings_before_bucket.access(bucket_id) + bucket_id;
uint64_t end = num_strings_before_bucket.access(bucket_id + 1) + bucket_id + 1;
uint64_t begin = num_super_kmers_before_bucket.access(bucket_id) + bucket_id;
uint64_t end = num_super_kmers_before_bucket.access(bucket_id + 1) + bucket_id + 1;
assert(begin < end);
return {begin, end};
}
Expand All @@ -59,8 +59,8 @@ struct buckets {

uint64_t lookup(uint64_t begin, uint64_t end, uint64_t target_kmer, uint64_t k,
uint64_t m) const {
for (uint64_t string_id = begin; string_id != end; ++string_id) {
uint64_t offset = offsets.access(string_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = offsets.access(super_kmer_id);
auto [kmer_id, offset_end] = offset_to_id(offset, k);
bit_vector_iterator bv_it(strings, 2 * offset);
uint64_t window_size = std::min<uint64_t>(k - m + 1, offset_end - offset - k + 1);
Expand All @@ -72,9 +72,9 @@ struct buckets {
return constants::invalid;
}

uint64_t lookup_in_string(uint64_t string_id, uint64_t target_kmer, uint64_t k,
uint64_t m) const {
uint64_t offset = offsets.access(string_id);
uint64_t lookup_in_super_kmer(uint64_t super_kmer_id, uint64_t target_kmer, uint64_t k,
uint64_t m) const {
uint64_t offset = offsets.access(super_kmer_id);
auto [kmer_id, offset_end] = offset_to_id(offset, k);
bit_vector_iterator bv_it(strings, 2 * offset);
uint64_t window_size = std::min<uint64_t>(k - m + 1, offset_end - offset - k + 1);
Expand All @@ -93,8 +93,8 @@ struct buckets {
}
uint64_t lookup_canonical(uint64_t begin, uint64_t end, uint64_t target_kmer,
uint64_t target_kmer_rc, uint64_t k, uint64_t m) const {
for (uint64_t string_id = begin; string_id != end; ++string_id) {
uint64_t offset = offsets.access(string_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = offsets.access(super_kmer_id);
auto [kmer_id, offset_end] = offset_to_id(offset, k);
bit_vector_iterator bv_it(strings, 2 * offset);
uint64_t window_size = std::min<uint64_t>(k - m + 1, offset_end - offset - k + 1);
Expand Down Expand Up @@ -183,20 +183,20 @@ struct buckets {
}

uint64_t num_bits() const {
return pieces.num_bits() + num_strings_before_bucket.num_bits() +
return pieces.num_bits() + num_super_kmers_before_bucket.num_bits() +
8 * (offsets.bytes() + strings.bytes());
}

template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(pieces);
visitor.visit(num_strings_before_bucket);
visitor.visit(num_super_kmers_before_bucket);
visitor.visit(offsets);
visitor.visit(strings);
}

ef_sequence<true> pieces;
ef_sequence<false> num_strings_before_bucket;
ef_sequence<false> num_super_kmers_before_bucket;
pthash::compact_vector offsets;
pthash::bit_vector strings;
};
Expand Down
142 changes: 71 additions & 71 deletions include/builder/build.cpp

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions include/builder/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ struct compact_string_pool {
compact_string_pool() {}

struct builder {
builder(uint64_t k) : k(k), offset(0), num_strings(0) {}
builder(uint64_t k) : k(k), offset(0), num_super_kmers(0) {}

void build(compact_string_pool& pool) {
pool.k = k;
pool.num_strings = num_strings;
pool.m_num_super_kmers = num_super_kmers;
pool.pieces.swap(pieces);
pool.strings.build(&bvb_strings);
}
Expand All @@ -43,7 +42,7 @@ struct compact_string_pool {
for (uint64_t i = prefix; i != size; ++i) {
bvb_strings.append_bits(util::char_to_uint64(string[i]), 2);
}
num_strings += 1;
num_super_kmers += 1;
offset = bvb_strings.size() / 2;
}

Expand All @@ -59,29 +58,30 @@ struct compact_string_pool {

uint64_t k;
uint64_t offset;
uint64_t num_strings;
uint64_t num_super_kmers;
std::vector<uint64_t> pieces;
pthash::bit_vector_builder bvb_strings;
};

uint64_t num_bits() const { return strings.size(); }
uint64_t size() const { return num_strings; }
uint64_t num_super_kmers() const { return m_num_super_kmers; }

uint64_t k;
uint64_t num_strings;
std::vector<uint64_t> pieces;
pthash::bit_vector strings;

private:
uint64_t m_num_super_kmers;
};

typedef uint8_t num_kmers_in_string_uint_type;
typedef uint8_t num_kmers_in_super_kmer_uint_type;

#pragma pack(push, 1)
struct minimizer_tuple {
minimizer_tuple(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_string)
: minimizer(minimizer), offset(offset), num_kmers_in_string(num_kmers_in_string) {}
minimizer_tuple(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_super_kmer)
: minimizer(minimizer), offset(offset), num_kmers_in_super_kmer(num_kmers_in_super_kmer) {}
uint64_t minimizer;
uint64_t offset;
num_kmers_in_string_uint_type num_kmers_in_string;
num_kmers_in_super_kmer_uint_type num_kmers_in_super_kmer;
};
#pragma pack(pop)

Expand All @@ -94,7 +94,7 @@ struct list_type {
iterator(std::vector<minimizer_tuple>::iterator begin) : m_begin(begin) {}

inline std::pair<uint64_t, uint64_t> operator*() const {
return {(*m_begin).offset, (*m_begin).num_kmers_in_string};
return {(*m_begin).offset, (*m_begin).num_kmers_in_super_kmer};
}

inline void operator++() { ++m_begin; }
Expand All @@ -121,8 +121,8 @@ struct minimizers_tuples {
// tuples.reserve(n);
// }

void emplace_back(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_string) {
tuples.emplace_back(minimizer, offset, num_kmers_in_string);
void emplace_back(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_super_kmer) {
tuples.emplace_back(minimizer, offset, num_kmers_in_super_kmer);
}

minimizer_tuple& back() { return tuples.back(); }
Expand Down
8 changes: 3 additions & 5 deletions include/info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ uint64_t skew_index::print_info() const {
num_kmers_in_skew_index += n;
lower = upper;
upper = 2 * lower;
// if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket;
}
return num_kmers_in_skew_index;
}
Expand All @@ -31,8 +30,8 @@ void dictionary::print_space_breakdown() const {
<< " [bits/kmer]\n";
std::cout << " pieces: " << static_cast<double>(m_buckets.pieces.num_bits()) / size()
<< " [bits/kmer]\n";
std::cout << " num_strings_before_bucket: "
<< static_cast<double>(m_buckets.num_strings_before_bucket.num_bits()) / size()
std::cout << " num_super_kmers_before_bucket: "
<< static_cast<double>(m_buckets.num_super_kmers_before_bucket.num_bits()) / size()
<< " [bits/kmer]\n";
std::cout << " offsets: " << static_cast<double>(8 * m_buckets.offsets.bytes()) / size()
<< " [bits/kmer]\n";
Expand All @@ -57,10 +56,9 @@ void dictionary::print_info() const {
std::cout << "canonicalized = " << (canonicalized() ? "true" : "false") << '\n';
std::cout << "weighted = " << (weighted() ? "true" : "false") << '\n';

std::cout << "num_strings = " << m_buckets.offsets.size() << '\n';
std::cout << "num_super_kmers = " << m_buckets.offsets.size() << '\n';
std::cout << "num_pieces = " << m_buckets.pieces.size() << " (+"
<< (2.0 * m_buckets.pieces.size() * (k() - 1)) / size() << " [bits/kmer])" << '\n';
std::cout << "num_symbols_in_string = " << m_buckets.strings.size() / 2 << '\n';
std::cout << "bits_per_offset = ceil(log2(" << m_buckets.strings.size() / 2
<< ")) = " << std::ceil(std::log2(m_buckets.strings.size() / 2)) << '\n';
uint64_t num_kmers_in_skew_index = m_skew_index.print_info();
Expand Down
33 changes: 17 additions & 16 deletions include/lookup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ uint64_t dictionary::lookup_uint64_regular_parsing(uint64_t uint64_kmer) const {
if (m_skew_index.empty()) return m_buckets.lookup(bucket_id, uint64_kmer, m_k, m_m);

auto [begin, end] = m_buckets.locate_bucket(bucket_id);
uint64_t num_strings_in_bucket = end - begin;
uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket);
if (log2_num_strings_in_bucket > m_skew_index.min_log2) {
uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_strings_in_bucket);
/* It must hold pos < num_strings_in_bucket for the kmer to exist. */
if (pos < num_strings_in_bucket) {
return m_buckets.lookup_in_string(begin + pos, uint64_kmer, m_k, m_m);
uint64_t num_super_kmers_in_bucket = end - begin;
uint64_t log2_num_super_kmers_in_bucket = util::ceil_log2_uint32(num_super_kmers_in_bucket);
if (log2_num_super_kmers_in_bucket > m_skew_index.min_log2) {
uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_super_kmers_in_bucket);
/* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */
if (pos < num_super_kmers_in_bucket) {
return m_buckets.lookup_in_super_kmer(begin + pos, uint64_kmer, m_k, m_m);
}
return constants::invalid;
}
Expand All @@ -34,17 +34,18 @@ uint64_t dictionary::lookup_uint64_canonical_parsing(uint64_t uint64_kmer) const
}

auto [begin, end] = m_buckets.locate_bucket(bucket_id);
uint64_t num_strings_in_bucket = end - begin;
uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket);
if (log2_num_strings_in_bucket > m_skew_index.min_log2) {
uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_strings_in_bucket);
if (pos < num_strings_in_bucket) {
uint64_t kmer_id = m_buckets.lookup_in_string(begin + pos, uint64_kmer, m_k, m_m);
uint64_t num_super_kmers_in_bucket = end - begin;
uint64_t log2_num_super_kmers_in_bucket = util::ceil_log2_uint32(num_super_kmers_in_bucket);
if (log2_num_super_kmers_in_bucket > m_skew_index.min_log2) {
uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_super_kmers_in_bucket);
if (pos < num_super_kmers_in_bucket) {
uint64_t kmer_id = m_buckets.lookup_in_super_kmer(begin + pos, uint64_kmer, m_k, m_m);
if (kmer_id != constants::invalid) return kmer_id;
}
uint64_t pos_rc = m_skew_index.lookup(uint64_kmer_rc, log2_num_strings_in_bucket);
if (pos_rc < num_strings_in_bucket) {
uint64_t kmer_id = m_buckets.lookup_in_string(begin + pos_rc, uint64_kmer_rc, m_k, m_m);
uint64_t pos_rc = m_skew_index.lookup(uint64_kmer_rc, log2_num_super_kmers_in_bucket);
if (pos_rc < num_super_kmers_in_bucket) {
uint64_t kmer_id =
m_buckets.lookup_in_super_kmer(begin + pos_rc, uint64_kmer_rc, m_k, m_m);
return kmer_id;
}
return constants::invalid;
Expand Down
22 changes: 12 additions & 10 deletions include/query/membership_query_canonical_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,19 @@ struct membership_query_canonical_parsing {
int is_member() {
bool check_minimizer = !same_minimizer();
if (!m_dict->m_skew_index.empty()) {
uint64_t num_strings_in_bucket = m_end - m_begin;
uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket);
if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_strings_in_bucket);
if (p < num_strings_in_bucket) {
uint64_t num_super_kmers_in_bucket = m_end - m_begin;
uint64_t log2_num_super_kmers_in_bucket =
util::ceil_log2_uint32(num_super_kmers_in_bucket);
if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_super_kmers_in_bucket);
if (p < num_super_kmers_in_bucket) {
int ret = is_member(m_begin + p, m_begin + p + 1, check_minimizer);
if (ret != return_value::KMER_NOT_FOUND) return ret;
check_minimizer = false;
}
uint64_t p_rc = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_strings_in_bucket);
if (p_rc < num_strings_in_bucket) {
uint64_t p_rc =
m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_super_kmers_in_bucket);
if (p_rc < num_super_kmers_in_bucket) {
int ret = is_member(m_begin + p_rc, m_begin + p_rc + 1, check_minimizer);
if (ret != return_value::KMER_NOT_FOUND) return ret;
}
Expand All @@ -167,8 +169,8 @@ struct membership_query_canonical_parsing {
}

int is_member(uint64_t begin, uint64_t end, bool check_minimizer) {
for (uint64_t string_id = begin; string_id != end; ++string_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(string_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id);
uint64_t pos_in_string = 2 * offset;
m_reverse = false;
m_string_iterator.at(pos_in_string);
Expand All @@ -180,7 +182,7 @@ struct membership_query_canonical_parsing {
while (m_pos_in_window != m_window_size) {
uint64_t val = m_string_iterator.read(2 * m_k);

if (check_minimizer and string_id == begin and m_pos_in_window == 0) {
if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) {
uint64_t val_rc = util::compute_reverse_complement(val, m_k);
uint64_t minimizer =
std::min<uint64_t>(util::compute_minimizer(val, m_k, m_m, m_seed),
Expand Down
34 changes: 18 additions & 16 deletions include/query/membership_query_regular_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,12 @@ struct membership_query_regular_parsing {
int is_member() {
bool check_minimizer = !same_minimizer();
if (!m_dict->m_skew_index.empty()) {
uint64_t num_strings_in_bucket = m_end - m_begin;
uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket);
if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_strings_in_bucket);
if (p < num_strings_in_bucket) {
uint64_t num_super_kmers_in_bucket = m_end - m_begin;
uint64_t log2_num_super_kmers_in_bucket =
util::ceil_log2_uint32(num_super_kmers_in_bucket);
if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_super_kmers_in_bucket);
if (p < num_super_kmers_in_bucket) {
int ret = is_member(m_begin + p, m_begin + p + 1, check_minimizer);
if (ret != return_value::KMER_NOT_FOUND) return ret;
}
Expand All @@ -198,11 +199,12 @@ struct membership_query_regular_parsing {
int is_member_rc() {
bool check_minimizer = !same_minimizer_rc();
if (!m_dict->m_skew_index.empty()) {
uint64_t num_strings_in_bucket = m_end - m_begin;
uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket);
if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_strings_in_bucket);
if (p < num_strings_in_bucket) {
uint64_t num_super_kmers_in_bucket = m_end - m_begin;
uint64_t log2_num_super_kmers_in_bucket =
util::ceil_log2_uint32(num_super_kmers_in_bucket);
if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) {
uint64_t p = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_super_kmers_in_bucket);
if (p < num_super_kmers_in_bucket) {
int ret = is_member_rc(m_begin + p, m_begin + p + 1, check_minimizer);
if (ret != return_value::KMER_NOT_FOUND) return ret;
}
Expand All @@ -213,8 +215,8 @@ struct membership_query_regular_parsing {
}

int is_member(uint64_t begin, uint64_t end, bool check_minimizer) {
for (uint64_t string_id = begin; string_id != end; ++string_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(string_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id);
uint64_t pos_in_string = 2 * offset;
m_reverse = false;
m_string_iterator.at(pos_in_string);
Expand All @@ -226,7 +228,7 @@ struct membership_query_regular_parsing {
while (m_pos_in_window != m_window_size) {
uint64_t val = m_string_iterator.read(2 * m_k);

if (check_minimizer and string_id == begin and m_pos_in_window == 0) {
if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) {
uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed);
if (minimizer != m_curr_minimizer) return return_value::MINIMIZER_NOT_FOUND;
}
Expand All @@ -247,8 +249,8 @@ struct membership_query_regular_parsing {
}

int is_member_rc(uint64_t begin, uint64_t end, bool check_minimizer) {
for (uint64_t string_id = begin; string_id != end; ++string_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(string_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id);
uint64_t pos_in_string = 2 * offset;
m_reverse = false;
m_string_iterator.at(pos_in_string);
Expand All @@ -260,7 +262,7 @@ struct membership_query_regular_parsing {
while (m_pos_in_window != m_window_size) {
uint64_t val = m_string_iterator.read(2 * m_k);

if (check_minimizer and string_id == begin and m_pos_in_window == 0) {
if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) {
uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed);
if (minimizer != m_curr_minimizer_rc) {
return return_value::MINIMIZER_NOT_FOUND;
Expand Down
Loading

0 comments on commit c48948c

Please sign in to comment.