Skip to content

Commit

Permalink
feat: support SAM files as sequence input and allow partial sequence …
Browse files Browse the repository at this point in the history
…input with an offset
  • Loading branch information
David Gichev authored and Taepper committed Jul 9, 2024
1 parent f3264e0 commit 0867963
Show file tree
Hide file tree
Showing 61 changed files with 614 additions and 884 deletions.
25 changes: 0 additions & 25 deletions include/silo/common/fasta_reader.h

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <stdexcept>
#include <string>

namespace silo {
namespace silo::file_reader {

class FastaFormatException : public std::runtime_error {
public:
Expand Down
17 changes: 17 additions & 0 deletions include/silo/file_reader/fasta_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

#include "silo/file_reader/file_reader.h"

namespace silo::file_reader {

/// FileReader specialisation declared for FASTA input files.
///
/// Only the constructor is defined inline; `nextEntry()` is declared here and
/// implemented elsewhere.
class FastaReader : public FileReader {
  public:
   /// Hands `in_file_name` straight to the FileReader base-class constructor.
   explicit FastaReader(const std::filesystem::path& in_file_name)
       : FileReader(in_file_name) {}

   /// Overrides FileReader::nextEntry().
   /// NOTE(review): presumably returns std::nullopt once the input is
   /// exhausted — confirm against the implementation.
   std::optional<ReadSequence> nextEntry() override;
};

}  // namespace silo::file_reader
29 changes: 29 additions & 0 deletions include/silo/file_reader/file_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <optional>
#include <string>

#include "silo/common/input_stream_wrapper.h"

namespace silo::file_reader {

/// Abstract base class for readers that hand out sequence entries one at a
/// time via `nextEntry()`. The input file is wrapped in a
/// silo::InputStreamWrapper that is constructed from the given path.
class FileReader {
  protected:
   /// Constructs the wrapped input stream from `in_file_name`.
   /// Protected, so only derived readers can be instantiated.
   explicit FileReader(const std::filesystem::path& in_file_name)
       : in_file(in_file_name) {}

   /// Wrapped input stream; accessible to derived readers.
   silo::InputStreamWrapper in_file;

  public:
   /// One entry produced by a reader.
   struct ReadSequence {
      std::string key;
      /// Zero-initialised so that a default-constructed entry never carries
      /// an indeterminate offset. The struct stays an aggregate (C++14 and
      /// later), so brace-initialisation with three values keeps working.
      uint32_t offset = 0;
      std::string sequence;
   };

   /// Implemented by each concrete reader.
   /// NOTE(review): presumably returns std::nullopt at end of input — confirm
   /// against the implementations.
   virtual std::optional<ReadSequence> nextEntry() = 0;

   /// Declared here, defined elsewhere.
   /// NOTE(review): presumably rewinds `in_file` to the start — confirm.
   void reset();

   /// Virtual so that derived readers are destroyed correctly through a
   /// FileReader pointer or reference.
   virtual ~FileReader() = default;
};

}  // namespace silo::file_reader
13 changes: 13 additions & 0 deletions include/silo/file_reader/sam_format_exception.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <stdexcept>
#include <string>

namespace silo::file_reader {

/// Exception type for SAM input, derived from std::runtime_error.
/// NOTE(review): presumably thrown by the SAM reader on malformed input —
/// confirm at the throw sites.
class SamFormatException : public std::runtime_error {
public:
/// Declared here, defined elsewhere.
/// NOTE(review): `error_message` is presumably forwarded to
/// std::runtime_error — confirm in the definition.
explicit SamFormatException(const std::string& error_message);
};

} // namespace silo::file_reader
18 changes: 18 additions & 0 deletions include/silo/file_reader/sam_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

#include "silo/common/string_utils.h"
#include "silo/file_reader/fasta_format_exception.h"
#include "silo/common/input_stream_wrapper.h"

namespace silo::file_reader {

/// FileReader specialisation declared for SAM input files.
///
/// Only the constructor is defined inline; `nextEntry()` is declared here and
/// implemented elsewhere.
///
/// NOTE(review): the include list of this header does not name
/// "silo/file_reader/file_reader.h"; confirm that FileReader is visible here
/// through one of the other headers.
class SamReader : public FileReader {
  public:
   /// Hands `in_file_name` straight to the FileReader base-class constructor.
   explicit SamReader(const std::filesystem::path& in_file_name)
       : FileReader(in_file_name) {}

   /// Overrides FileReader::nextEntry().
   /// NOTE(review): presumably returns std::nullopt once the input is
   /// exhausted — confirm against the implementation.
   std::optional<ReadSequence> nextEntry() override;
};

}  // namespace silo::file_reader
18 changes: 15 additions & 3 deletions include/silo/preprocessing/preprocessing_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

namespace silo {

class ZstdFastaTable;
class ZstdTable;
class ReferenceGenomes;
class CompressSequence;

Expand Down Expand Up @@ -42,13 +42,25 @@ class PreprocessingDatabase {

std::unique_ptr<duckdb::MaterializedQueryResult> query(std::string sql_query);

ZstdFastaTable generateSequenceTableFromFasta(
ZstdTable generateSequenceTableViaFile(
const std::string& table_name,
const std::string& reference_sequence,
const std::filesystem::path& file_path
);

ZstdTable generateSequenceTableFromFasta(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
);

ZstdTable generateSequenceTableFromZstdFasta(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
);

ZstdFastaTable generateSequenceTableFromZstdFasta(
ZstdTable generateSequenceTableFromSAM(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
Expand Down
15 changes: 15 additions & 0 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"
#include "silo/common/table_reader.h"
#include "silo/storage/sequence_store.h"
#include "silo/zstd/zstd_decompressor.h"

namespace silo {
class Database;
Expand Down Expand Up @@ -88,6 +91,18 @@ class Preprocessor {
const std::string& order_by_clause
);

template <typename SymbolType>
ColumnFunction createRawReadLambda(
ZstdDecompressor& decompressor,
silo::SequenceStorePartition<SymbolType>& sequence_store
);

template <typename SymbolType>
ColumnFunction createInsertionLambda(
const std::string& sequence_name,
silo::SequenceStorePartition<SymbolType>& sequence_store
);

template <typename SymbolType>
void buildSequenceStore(
Database& database,
Expand Down
2 changes: 1 addition & 1 deletion include/silo/preprocessing/sql_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <duckdb.hpp>

#include "silo/storage/pango_lineage_alias.h"
#include "silo/zstdfasta/zstd_compressor.h"
#include "silo/zstd/zstd_compressor.h"

namespace silo {

Expand Down
36 changes: 31 additions & 5 deletions include/silo/storage/sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
#include <vector>

#include <fmt/format.h>
#include <duckdb/main/connection.hpp>
#include <roaring/roaring.hh>

#include "silo/common/aa_symbols.h"
#include "silo/common/format_number.h"
#include "silo/common/nucleotide_symbols.h"
#include "silo/common/symbol_map.h"
#include "silo/common/table_reader.h"
#include "silo/storage/insertion_index.h"
#include "silo/storage/position.h"

Expand All @@ -25,14 +27,27 @@ class access;
namespace silo {
template <typename SymbolType>
class Position;
class ZstdFastaTableReader;
class ZstdTableReader;

/// Plain summary record with three counters for a sequence store.
///
/// Every field is zero-initialised so that a default-constructed record never
/// exposes indeterminate values. The struct stays an aggregate (C++14 and
/// later), so `SequenceStoreInfo{count, size, n_size}` keeps working.
/// NOTE(review): the units of `size` and `n_bitmaps_size` (presumably bytes)
/// are not visible here — confirm where the record is filled.
struct SequenceStoreInfo {
   uint32_t sequence_count = 0;
   uint64_t size = 0;
   size_t n_bitmaps_size = 0;
};

/// A single read: an owned copy of the sequence text, an offset, and a flag
/// telling whether the slot holds a real read.
///
/// A default-constructed object is the "empty" state: `is_valid == false`,
/// an empty `sequence` and `offset == 0`.
/// NOTE(review): `offset` is presumably the position at which a partial
/// sequence starts (the commit title mentions "partial sequence input with an
/// offset") — confirm against the code that consumes it.
struct ReadSequence {
   bool is_valid = false;
   std::string sequence;
   /// Zero-initialised so that the default-constructed state is fully defined.
   uint32_t offset = 0;

   /// Creates the empty, invalid state described above.
   ReadSequence() = default;

   /// Creates a valid read holding a copy of `_sequence`.
   ///
   /// Deliberately left non-explicit so that existing implicit conversions
   /// from std::string_view keep compiling.
   ///
   /// @param _sequence text to copy into `sequence`
   /// @param _offset   value stored in `offset`; defaults to 0
   ReadSequence(std::string_view _sequence, uint32_t _offset = 0)
       // Listed in declaration order, which is the order members are
       // actually initialised in.
       : is_valid(true),
         sequence(_sequence),
         offset(_offset) {}
};

template <typename SymbolType>
class SequenceStorePartition {
friend class boost::serialization::access;
Expand Down Expand Up @@ -60,19 +75,21 @@ class SequenceStorePartition {
uint32_t sequence_count = 0;

private:
void fillIndexes(const std::vector<std::optional<std::string>>& genomes);
void fillIndexes(const std::vector<ReadSequence>& reads);

void addSymbolsToPositions(
size_t position_idx,
SymbolMap<SymbolType, std::vector<uint32_t>>& ids_per_symbol_for_current_position,
size_t number_of_sequences
);

void fillNBitmaps(const std::vector<std::optional<std::string>>& genomes);
void fillNBitmaps(const std::vector<ReadSequence>& reads);

void optimizeBitmaps();

public:
static constexpr size_t BUFFER_SIZE = 1024;
std::vector<ReadSequence> lazy_buffer;
explicit SequenceStorePartition(
const std::vector<typename SymbolType::Symbol>& reference_sequence
);
Expand All @@ -86,11 +103,20 @@ class SequenceStorePartition {

[[nodiscard]] SequenceStoreInfo getInfo() const;

size_t fill(silo::ZstdFastaTableReader& input);
size_t fill(
duckdb::Connection& connection,
std::string table_name,
std::string_view key_column,
std::string_view reference_sequence_string,
std::string_view where_clause,
std::string_view order_by_clause
);

ReadSequence& reserveRead();

void insertInsertion(size_t row_id, const std::string& insertion_and_position);

void interpret(const std::vector<std::optional<std::string>>& genomes);
void interpret(const std::vector<ReadSequence>& reads);

void finalize();
};
Expand Down
2 changes: 1 addition & 1 deletion include/silo/storage/unaligned_sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class access;
} // namespace boost::serialization

namespace silo {
class ZstdFastaTableReader;
class ZstdTableReader;

/// Holds information where to read unaligned sequences for a
/// segment (= the sequence of a particular name) in one partition.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

#include <zstd.h>

#include "silo/zstdfasta/zstd_context.h"
#include "silo/zstdfasta/zstd_dictionary.h"
#include "silo/zstd/zstd_context.h"
#include "silo/zstd/zstd_dictionary.h"

namespace silo {

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ namespace silo {
class ZstdDecompressor {
ZstdDDictionary zstd_dictionary;
ZstdDContext zstd_context;
std::string buffer;

public:
explicit ZstdDecompressor(std::string_view dictionary_string);

std::string_view decompress(const std::string& input);
void decompress(const std::string& input, std::string& buffer);

std::string_view decompress(const char* input_data, size_t input_length);
void decompress(const char* input_data, size_t input_length, std::string& buffer);
};

} // namespace silo
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,31 @@

#include <string>

#include "silo/file_reader/file_reader.h"

namespace duckdb {
struct Connection;
}

namespace silo {
class ZstdFastaReader;
class ZstdFastaTableReader;
class FastaReader;

class ZstdFastaTable {
class ZstdTable {
duckdb::Connection& connection;
std::string table_name;
std::string_view compression_dict;

ZstdFastaTable(
ZstdTable(
duckdb::Connection& connection,
std::string table_name,
std::string_view compression_dict
);

public:
ZstdFastaTableReader getReader(std::string_view where_clause, std::string_view order_by_clause);

static ZstdFastaTable generate(
duckdb::Connection& connection,
const std::string& table_name,
ZstdFastaReader& file_reader,
std::string_view reference_sequence
);

static ZstdFastaTable generate(
static ZstdTable generate(
duckdb::Connection& connection,
const std::string& table_name,
FastaReader& file_reader,
file_reader::FileReader& file_reader,
std::string_view reference_sequence
);
};
Expand Down
Loading

0 comments on commit 0867963

Please sign in to comment.