From 0867963c777257098a7b7bbff0006da269d6c69f Mon Sep 17 00:00:00 2001 From: David Gichev Date: Tue, 14 May 2024 12:41:06 +0200 Subject: [PATCH] feat: support SAM files as sequence input and allow partial sequence input with an offset --- include/silo/common/fasta_reader.h | 25 ---- .../fasta_format_exception.h | 2 +- include/silo/file_reader/fasta_reader.h | 17 +++ include/silo/file_reader/file_reader.h | 29 ++++ .../silo/file_reader/sam_format_exception.h | 13 ++ include/silo/file_reader/sam_reader.h | 18 +++ .../preprocessing/preprocessing_database.h | 18 ++- include/silo/preprocessing/preprocessor.h | 15 ++ include/silo/preprocessing/sql_function.h | 2 +- include/silo/storage/sequence_store.h | 36 ++++- .../silo/storage/unaligned_sequence_store.h | 2 +- .../{zstdfasta => zstd}/zstd_compressor.h | 4 +- .../silo/{zstdfasta => zstd}/zstd_context.h | 0 .../{zstdfasta => zstd}/zstd_decompressor.h | 5 +- .../{zstdfasta => zstd}/zstd_dictionary.h | 0 .../zstdfasta_table.h => zstd/zstd_table.h} | 21 +-- .../zstd_table_reader.h} | 8 +- include/silo/zstdfasta/zstdfasta_reader.h | 35 ----- include/silo/zstdfasta/zstdfasta_writer.h | 42 ------ src/silo/common/aa_symbols.cpp | 3 +- src/silo/common/fasta_reader.cpp | 49 ------- src/silo/common/nucleotide_symbols.cpp | 1 - src/silo/database.cpp | 7 +- .../fasta_format_exception.cpp | 4 +- src/silo/file_reader/fasta_reader.cpp | 40 ++++++ .../fasta_reader.test.cpp | 59 ++++---- src/silo/file_reader/file_reader.cpp | 6 + src/silo/file_reader/sam_format_exception.cpp | 11 ++ src/silo/file_reader/sam_reader.cpp | 27 ++++ .../preprocessing/preprocessing_database.cpp | 57 ++++++-- src/silo/preprocessing/preprocessor.cpp | 92 ++++++++---- src/silo/preprocessing/preprocessor.test.cpp | 30 ++++ src/silo/preprocessing/sql_function.cpp | 2 +- src/silo/query_engine/actions/fasta.cpp | 4 +- src/silo/storage/reference_genomes.cpp | 1 - src/silo/storage/sequence_store.cpp | 92 +++++------- src/silo/storage/unaligned_sequence_store.cpp | 1 - .../{zstdfasta => zstd}/zstd_compressor.cpp | 2 +- src/silo/{zstdfasta => zstd}/zstd_context.cpp | 2 +- .../{zstdfasta => zstd}/zstd_decompressor.cpp | 15 +- .../{zstdfasta => zstd}/zstd_dictionary.cpp | 2 +- src/silo/zstd/zstd_table.cpp | 72 ++++++++++ .../zstd_table_reader.cpp} | 45 +++--- src/silo/zstdfasta/zstdfasta_reader.cpp | 82 ----------- src/silo/zstdfasta/zstdfasta_reader.test.cpp | 82 ----------- src/silo/zstdfasta/zstdfasta_table.cpp | 106 -------------- .../zstdfasta/zstdfasta_table_reader.test.cpp | 135 ------------------ src/silo/zstdfasta/zstdfasta_writer.cpp | 79 ---------- src/silo/zstdfasta/zstdfasta_writer.test.cpp | 45 ------ testBaseData/samFiles/aa_insertions.tsv | 3 + testBaseData/samFiles/database_config.yaml | 13 ++ testBaseData/samFiles/gene_someLongGene.fasta | 2 + .../samFiles/gene_someShortGene.fasta | 2 + testBaseData/samFiles/metadata.tsv | 3 + testBaseData/samFiles/nuc_insertions.tsv | 2 + testBaseData/samFiles/nuc_main.sam | 2 + testBaseData/samFiles/nuc_secondSegment.fasta | 0 .../samFiles/preprocessing_config.yaml | 2 + testBaseData/samFiles/reference_genomes.json | 22 +++ testBaseData/samFiles/unaligned_main.sam | 2 + .../samFiles/unaligned_secondSegment.fasta | 0 61 files changed, 614 insertions(+), 884 deletions(-) delete mode 100644 include/silo/common/fasta_reader.h rename include/silo/{common => file_reader}/fasta_format_exception.h (87%) create mode 100644 include/silo/file_reader/fasta_reader.h create mode 100644 include/silo/file_reader/file_reader.h create mode 100644 include/silo/file_reader/sam_format_exception.h create mode 100644 include/silo/file_reader/sam_reader.h rename include/silo/{zstdfasta => zstd}/zstd_compressor.h (85%) rename include/silo/{zstdfasta => zstd}/zstd_context.h (100%) rename include/silo/{zstdfasta => zstd}/zstd_decompressor.h (68%) rename include/silo/{zstdfasta => zstd}/zstd_dictionary.h (100%) rename include/silo/{zstdfasta/zstdfasta_table.h => zstd/zstd_table.h} (51%) rename include/silo/{zstdfasta/zstdfasta_table_reader.h => zstd/zstd_table_reader.h} (91%) delete mode 100644 include/silo/zstdfasta/zstdfasta_reader.h delete mode 100644 include/silo/zstdfasta/zstdfasta_writer.h delete mode 100644 src/silo/common/fasta_reader.cpp rename src/silo/{common => file_reader}/fasta_format_exception.cpp (69%) create mode 100644 src/silo/file_reader/fasta_reader.cpp rename src/silo/{common => file_reader}/fasta_reader.test.cpp (58%) create mode 100644 src/silo/file_reader/file_reader.cpp create mode 100644 src/silo/file_reader/sam_format_exception.cpp create mode 100644 src/silo/file_reader/sam_reader.cpp rename src/silo/{zstdfasta => zstd}/zstd_compressor.cpp (96%) rename src/silo/{zstdfasta => zstd}/zstd_context.cpp (95%) rename src/silo/{zstdfasta => zstd}/zstd_decompressor.cpp (80%) rename src/silo/{zstdfasta => zstd}/zstd_dictionary.cpp (95%) create mode 100644 src/silo/zstd/zstd_table.cpp rename src/silo/{zstdfasta/zstdfasta_table_reader.cpp => zstd/zstd_table_reader.cpp} (73%) delete mode 100644 src/silo/zstdfasta/zstdfasta_reader.cpp delete mode 100644 src/silo/zstdfasta/zstdfasta_reader.test.cpp delete mode 100644 src/silo/zstdfasta/zstdfasta_table.cpp delete mode 100644 src/silo/zstdfasta/zstdfasta_table_reader.test.cpp delete mode 100644 src/silo/zstdfasta/zstdfasta_writer.cpp delete mode 100644 src/silo/zstdfasta/zstdfasta_writer.test.cpp create mode 100644 testBaseData/samFiles/aa_insertions.tsv create mode 100644 testBaseData/samFiles/database_config.yaml create mode 100644 testBaseData/samFiles/gene_someLongGene.fasta create mode 100644 testBaseData/samFiles/gene_someShortGene.fasta create mode 100644 testBaseData/samFiles/metadata.tsv create mode 100644 testBaseData/samFiles/nuc_insertions.tsv create mode 100644 testBaseData/samFiles/nuc_main.sam create mode 100644 testBaseData/samFiles/nuc_secondSegment.fasta create mode 100644 testBaseData/samFiles/preprocessing_config.yaml create mode 100644 testBaseData/samFiles/reference_genomes.json create mode 100644 testBaseData/samFiles/unaligned_main.sam create mode 100644 testBaseData/samFiles/unaligned_secondSegment.fasta diff --git a/include/silo/common/fasta_reader.h b/include/silo/common/fasta_reader.h deleted file mode 100644 index cb626c8c0..000000000 --- a/include/silo/common/fasta_reader.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "silo/common/input_stream_wrapper.h" - -namespace silo { -class FastaReader { - private: - silo::InputStreamWrapper in_file; - - std::optional nextKey(); - - public: - explicit FastaReader(const std::filesystem::path& in_file_name); - - std::optional nextSkipGenome(); - - std::optional next(std::string& genome_buffer); - - void reset(); -}; -} // namespace silo diff --git a/include/silo/common/fasta_format_exception.h b/include/silo/file_reader/fasta_format_exception.h similarity index 87% rename from include/silo/common/fasta_format_exception.h rename to include/silo/file_reader/fasta_format_exception.h index c7b7afb80..3d29ff8fe 100644 --- a/include/silo/common/fasta_format_exception.h +++ b/include/silo/file_reader/fasta_format_exception.h @@ -3,7 +3,7 @@ #include #include -namespace silo { +namespace silo::file_reader { class FastaFormatException : public std::runtime_error { public: diff --git a/include/silo/file_reader/fasta_reader.h b/include/silo/file_reader/fasta_reader.h new file mode 100644 index 000000000..7689686a2 --- /dev/null +++ b/include/silo/file_reader/fasta_reader.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +#include "silo/file_reader/file_reader.h" + +namespace silo::file_reader { + +class FastaReader : public FileReader { + public: + std::optional nextEntry() override; + explicit FastaReader(const std::filesystem::path& in_file_name) + : FileReader(in_file_name) {} +}; +} // namespace silo \ No newline at end of file diff --git a/include/silo/file_reader/file_reader.h b/include/silo/file_reader/file_reader.h new file mode 100644 index 000000000..d39738c7a --- /dev/null +++ b/include/silo/file_reader/file_reader.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +#include "silo/common/input_stream_wrapper.h" + +namespace silo::file_reader { +class FileReader { + protected: + explicit FileReader(const std::filesystem::path& in_file_name) + : in_file(in_file_name){}; + + silo::InputStreamWrapper in_file; + + public: + struct ReadSequence { + std::string key; + uint32_t offset; + std::string sequence; + }; + + virtual std::optional nextEntry() = 0; + + void reset(); + + virtual ~FileReader(){}; +}; +} // namespace silo::file_reader diff --git a/include/silo/file_reader/sam_format_exception.h b/include/silo/file_reader/sam_format_exception.h new file mode 100644 index 000000000..13e4d540d --- /dev/null +++ b/include/silo/file_reader/sam_format_exception.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +namespace silo::file_reader { + +class SamFormatException : public std::runtime_error { + public: + explicit SamFormatException(const std::string& error_message); +}; + +} // namespace silo diff --git a/include/silo/file_reader/sam_reader.h b/include/silo/file_reader/sam_reader.h new file mode 100644 index 000000000..3ee384776 --- /dev/null +++ b/include/silo/file_reader/sam_reader.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include +#include + +#include "silo/common/string_utils.h" +#include "silo/file_reader/fasta_format_exception.h" +#include "silo/common/input_stream_wrapper.h" + +namespace silo::file_reader { +class SamReader : public FileReader { + public: + std::optional nextEntry() override; + explicit SamReader(const std::filesystem::path& in_file_name) + : FileReader(in_file_name) {} +}; +} // namespace silo diff --git a/include/silo/preprocessing/preprocessing_database.h b/include/silo/preprocessing/preprocessing_database.h index 3ca8677e1..61ffcd993 100644 --- a/include/silo/preprocessing/preprocessing_database.h +++ b/include/silo/preprocessing/preprocessing_database.h @@ -9,7 +9,7 @@ namespace silo { -class ZstdFastaTable; +class ZstdTable; class ReferenceGenomes; class CompressSequence; @@ -42,13 +42,25 @@ class PreprocessingDatabase { std::unique_ptr query(std::string sql_query); - ZstdFastaTable generateSequenceTableFromFasta( + ZstdTable generateSequenceTableViaFile( + const std::string& table_name, + const std::string& reference_sequence, + const std::filesystem::path& file_path + ); + + ZstdTable generateSequenceTableFromFasta( + const std::string& table_name, + const std::string& reference_sequence, + const std::string& filename + ); + + ZstdTable generateSequenceTableFromZstdFasta( const std::string& table_name, const std::string& reference_sequence, const std::string& filename ); - ZstdFastaTable generateSequenceTableFromZstdFasta( + ZstdTable generateSequenceTableFromSAM( const std::string& table_name, const std::string& reference_sequence, const std::string& filename diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index 9beb96995..9890885c2 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -7,6 +7,9 @@ #include "silo/preprocessing/preprocessing_database.h" #include "silo/storage/pango_lineage_alias.h" #include "silo/storage/reference_genomes.h" +#include "silo/common/table_reader.h" +#include "silo/storage/sequence_store.h" +#include "silo/zstd/zstd_decompressor.h" namespace silo { class Database; @@ -88,6 +91,18 @@ class Preprocessor { const std::string& order_by_clause ); + template + ColumnFunction createRawReadLambda( + ZstdDecompressor& decompressor, + silo::SequenceStorePartition& sequence_store + ); + + template + ColumnFunction createInsertionLambda( + const std::string& sequence_name, + silo::SequenceStorePartition& sequence_store + ); + template void buildSequenceStore( Database& database, diff --git a/include/silo/preprocessing/sql_function.h b/include/silo/preprocessing/sql_function.h index 5c657dc82..a7fd5b6df 100644 --- a/include/silo/preprocessing/sql_function.h +++ b/include/silo/preprocessing/sql_function.h @@ -9,7 +9,7 @@ #include #include "silo/storage/pango_lineage_alias.h" -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" namespace silo { diff --git a/include/silo/storage/sequence_store.h b/include/silo/storage/sequence_store.h index bb091fbbf..34208b771 100644 --- a/include/silo/storage/sequence_store.h +++ b/include/silo/storage/sequence_store.h @@ -9,12 +9,14 @@ #include #include +#include #include #include "silo/common/aa_symbols.h" #include "silo/common/format_number.h" #include "silo/common/nucleotide_symbols.h" #include "silo/common/symbol_map.h" +#include "silo/common/table_reader.h" #include "silo/storage/insertion_index.h" #include "silo/storage/position.h" @@ -25,7 +27,7 @@ class access; namespace silo { template class Position; -class ZstdFastaTableReader; +class ZstdTableReader; struct SequenceStoreInfo { uint32_t sequence_count; @@ -33,6 +35,19 @@ struct SequenceStoreInfo { size_t n_bitmaps_size; }; +struct ReadSequence { + bool is_valid = false; + std::string sequence = ""; + uint32_t offset; + + ReadSequence(std::string_view _sequence, uint32_t _offset = 0) + : sequence(std::move(_sequence)), + offset(_offset), + is_valid(true) {} + + ReadSequence() {} +}; + template class SequenceStorePartition { friend class boost::serialization::access; @@ -60,7 +75,7 @@ class SequenceStorePartition { uint32_t sequence_count = 0; private: - void fillIndexes(const std::vector>& genomes); + void fillIndexes(const std::vector& reads); void addSymbolsToPositions( size_t position_idx, @@ -68,11 +83,13 @@ class SequenceStorePartition { size_t number_of_sequences ); - void fillNBitmaps(const std::vector>& genomes); + void fillNBitmaps(const std::vector& reads); void optimizeBitmaps(); public: + static constexpr size_t BUFFER_SIZE = 1024; + std::vector lazy_buffer; explicit SequenceStorePartition( const std::vector& reference_sequence ); @@ -86,11 +103,20 @@ class SequenceStorePartition { [[nodiscard]] SequenceStoreInfo getInfo() const; - size_t fill(silo::ZstdFastaTableReader& input); + size_t fill( + duckdb::Connection& connection, + std::string table_name, + std::string_view key_column, + std::string_view reference_sequence_string, + std::string_view where_clause, + std::string_view order_by_clause + ); + + ReadSequence& reserveRead(); void insertInsertion(size_t row_id, const std::string& insertion_and_position); - void interpret(const std::vector>& genomes); + void interpret(const std::vector& reads); void finalize(); }; diff --git a/include/silo/storage/unaligned_sequence_store.h b/include/silo/storage/unaligned_sequence_store.h index 8aa1e568d..4b724032d 100644 --- a/include/silo/storage/unaligned_sequence_store.h +++ b/include/silo/storage/unaligned_sequence_store.h @@ -10,7 +10,7 @@ class access; } // namespace boost::serialization namespace silo { -class ZstdFastaTableReader; +class ZstdTableReader; /// Holds information where to read unaligned sequences for a /// segment (= the sequence of a particular name) in one partition. diff --git a/include/silo/zstdfasta/zstd_compressor.h b/include/silo/zstd/zstd_compressor.h similarity index 85% rename from include/silo/zstdfasta/zstd_compressor.h rename to include/silo/zstd/zstd_compressor.h index 00ab1972c..edebb3f47 100644 --- a/include/silo/zstdfasta/zstd_compressor.h +++ b/include/silo/zstd/zstd_compressor.h @@ -7,8 +7,8 @@ #include -#include "silo/zstdfasta/zstd_context.h" -#include "silo/zstdfasta/zstd_dictionary.h" +#include "silo/zstd/zstd_context.h" +#include "silo/zstd/zstd_dictionary.h" namespace silo { diff --git a/include/silo/zstdfasta/zstd_context.h b/include/silo/zstd/zstd_context.h similarity index 100% rename from include/silo/zstdfasta/zstd_context.h rename to include/silo/zstd/zstd_context.h diff --git a/include/silo/zstdfasta/zstd_decompressor.h b/include/silo/zstd/zstd_decompressor.h similarity index 68% rename from include/silo/zstdfasta/zstd_decompressor.h rename to include/silo/zstd/zstd_decompressor.h index b85d4d1a4..b7b7982e2 100644 --- a/include/silo/zstdfasta/zstd_decompressor.h +++ b/include/silo/zstd/zstd_decompressor.h @@ -13,14 +13,13 @@ namespace silo { class ZstdDecompressor { ZstdDDictionary zstd_dictionary; ZstdDContext zstd_context; - std::string buffer; public: explicit ZstdDecompressor(std::string_view dictionary_string); - std::string_view decompress(const std::string& input); + void decompress(const std::string& input, std::string& buffer); - std::string_view decompress(const char* input_data, size_t input_length); + void decompress(const char* input_data, size_t input_length, std::string& buffer); }; } // namespace silo diff --git a/include/silo/zstdfasta/zstd_dictionary.h b/include/silo/zstd/zstd_dictionary.h similarity index 100% rename from include/silo/zstdfasta/zstd_dictionary.h rename to include/silo/zstd/zstd_dictionary.h diff --git a/include/silo/zstdfasta/zstdfasta_table.h b/include/silo/zstd/zstd_table.h similarity index 51% rename from include/silo/zstdfasta/zstdfasta_table.h rename to include/silo/zstd/zstd_table.h index 4df546cfb..91df4f9d3 100644 --- a/include/silo/zstdfasta/zstdfasta_table.h +++ b/include/silo/zstd/zstd_table.h @@ -2,40 +2,31 @@ #include +#include "silo/file_reader/file_reader.h" + namespace duckdb { struct Connection; } namespace silo { -class ZstdFastaReader; -class ZstdFastaTableReader; class FastaReader; -class ZstdFastaTable { +class ZstdTable { duckdb::Connection& connection; std::string table_name; std::string_view compression_dict; - ZstdFastaTable( + ZstdTable( duckdb::Connection& connection, std::string table_name, std::string_view compression_dict ); public: - ZstdFastaTableReader getReader(std::string_view where_clause, std::string_view order_by_clause); - - static ZstdFastaTable generate( - duckdb::Connection& connection, - const std::string& table_name, - ZstdFastaReader& file_reader, - std::string_view reference_sequence - ); - - static ZstdFastaTable generate( + static ZstdTable generate( duckdb::Connection& connection, const std::string& table_name, - FastaReader& file_reader, + file_reader::FileReader& file_reader, std::string_view reference_sequence ); }; diff --git a/include/silo/zstdfasta/zstdfasta_table_reader.h b/include/silo/zstd/zstd_table_reader.h similarity index 91% rename from include/silo/zstdfasta/zstdfasta_table_reader.h rename to include/silo/zstd/zstd_table_reader.h index 227a590a1..d6b67a53d 100644 --- a/include/silo/zstdfasta/zstdfasta_table_reader.h +++ b/include/silo/zstd/zstd_table_reader.h @@ -7,7 +7,7 @@ #include #include -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" namespace duckdb { struct Connection; @@ -18,7 +18,7 @@ struct DataChunk; namespace silo { struct ZstdDecompressor; -class ZstdFastaTableReader { +class ZstdTableReader { private: duckdb::Connection& connection; std::string table_name; @@ -37,7 +37,7 @@ class ZstdFastaTableReader { void advanceRow(); public: - explicit ZstdFastaTableReader( + explicit ZstdTableReader( duckdb::Connection& connection, std::string_view table_name, std::string_view compression_dict, @@ -60,4 +60,4 @@ class ZstdFastaTableReader { size_t lineCount(); }; -} // namespace silo +} // namespace silo \ No newline at end of file diff --git a/include/silo/zstdfasta/zstdfasta_reader.h b/include/silo/zstdfasta/zstdfasta_reader.h deleted file mode 100644 index 4c29d4fdf..000000000 --- a/include/silo/zstdfasta/zstdfasta_reader.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "silo/zstdfasta/zstd_decompressor.h" - -namespace silo { - -class ZstdFastaReader { - private: - std::ifstream in_file; - std::unique_ptr decompressor; - - std::optional nextKey(); - - public: - explicit ZstdFastaReader( - const std::filesystem::path& in_file_name, - std::string_view compression_dict - ); - - std::optional nextSkipGenome(); - - std::optional next(std::string& genome); - - std::optional nextCompressed(std::string& compressed_genome); - - void reset(); -}; -} // namespace silo diff --git a/include/silo/zstdfasta/zstdfasta_writer.h b/include/silo/zstdfasta/zstdfasta_writer.h deleted file mode 100644 index 4776feb29..000000000 --- a/include/silo/zstdfasta/zstdfasta_writer.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "zstd_compressor.h" - -namespace silo { -class ZstdFastaWriter { - private: - std::ofstream outStream; - std::unique_ptr compressor; - std::optional default_sequence; - - public: - explicit ZstdFastaWriter( - const std::filesystem::path& out_file_name, - std::string_view compression_dict - ); - - explicit ZstdFastaWriter( - const std::filesystem::path& out_file_name, - std::string_view compression_dict, - const std::string& default_sequence_ - ); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void write(const std::string& key, const std::string& genome); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void writeRaw(const std::string& key, const std::string& compressed_genome); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void writeRaw(const std::string& key, std::string_view compressed_genome); - - void writeDefault(const std::string& key); -}; -} // namespace silo diff --git a/src/silo/common/aa_symbols.cpp b/src/silo/common/aa_symbols.cpp index 30dc0a423..c2c9b1a93 100644 --- a/src/silo/common/aa_symbols.cpp +++ b/src/silo/common/aa_symbols.cpp @@ -2,7 +2,7 @@ #include -using silo::AminoAcid; +namespace silo { char AminoAcid::symbolToChar(AminoAcid::Symbol symbol) { switch (symbol) { @@ -170,3 +170,4 @@ const silo::SymbolMap> AminoAcid::AMBI {AminoAcid::Symbol::STOP, AminoAcid::Symbol::X}, {AminoAcid::Symbol::X}, }}}; +} // namespace silo \ No newline at end of file diff --git a/src/silo/common/fasta_reader.cpp b/src/silo/common/fasta_reader.cpp deleted file mode 100644 index f7e1e71cc..000000000 --- a/src/silo/common/fasta_reader.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "silo/common/fasta_reader.h" - -#include -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/common/input_stream_wrapper.h" - -silo::FastaReader::FastaReader(const std::filesystem::path& in_file_name) - : in_file(in_file_name) {} - -std::optional silo::FastaReader::nextKey() { - std::string key_with_prefix; - if (!getline(in_file.getInputStream(), key_with_prefix)) { - return std::nullopt; - } - - if (key_with_prefix.empty() || key_with_prefix.at(0) != '>') { - throw FastaFormatException("Fasta key prefix '>' missing for key: " + key_with_prefix); - } - - return key_with_prefix.substr(1); -} - -std::optional silo::FastaReader::nextSkipGenome() { - auto key = nextKey(); - - in_file.getInputStream().ignore(LONG_MAX, '\n'); - return key; -} - -std::optional silo::FastaReader::next(std::string& genome_buffer) { - auto key = nextKey(); - if (!key) { - return std::nullopt; - } - - if (!getline(in_file.getInputStream(), genome_buffer)) { - throw FastaFormatException("Missing genome sequence in line following key: " + *key); - } - - return key; -} - -void silo::FastaReader::reset() { - in_file.getInputStream().clear(); // clear fail and eof bits - in_file.getInputStream().seekg(0, std::ios::beg); // g pointer back to the start -} diff --git a/src/silo/common/nucleotide_symbols.cpp b/src/silo/common/nucleotide_symbols.cpp index e5a46c9ae..1f7ff14b3 100644 --- a/src/silo/common/nucleotide_symbols.cpp +++ b/src/silo/common/nucleotide_symbols.cpp @@ -154,5 +154,4 @@ const silo::SymbolMap> Nucleotide::A {Nucleotide::Symbol::V}, {Nucleotide::Symbol::N}, }}}; - } // namespace silo \ No newline at end of file diff --git a/src/silo/database.cpp b/src/silo/database.cpp index aba4253a0..12501dc7e 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -38,12 +38,12 @@ #include "silo/common/block_timer.h" #include "silo/common/data_version.h" -#include "silo/common/fasta_reader.h" #include "silo/common/format_number.h" #include "silo/common/nucleotide_symbols.h" #include "silo/config/database_config.h" #include "silo/config/preprocessing_config.h" #include "silo/database_info.h" +#include "silo/file_reader/fasta_reader.h" #include "silo/persistence/exception.h" #include "silo/preprocessing/metadata_info.h" #include "silo/preprocessing/preprocessing_exception.h" @@ -63,9 +63,8 @@ #include "silo/storage/sequence_store.h" #include "silo/storage/serialize_optional.h" #include "silo/storage/unaligned_sequence_store.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_decompressor.h" +#include "silo/zstd/zstd_table.h" namespace silo { diff --git a/src/silo/common/fasta_format_exception.cpp b/src/silo/file_reader/fasta_format_exception.cpp similarity index 69% rename from src/silo/common/fasta_format_exception.cpp rename to src/silo/file_reader/fasta_format_exception.cpp index 6f25754ae..efb703183 100644 --- a/src/silo/common/fasta_format_exception.cpp +++ b/src/silo/file_reader/fasta_format_exception.cpp @@ -1,9 +1,9 @@ -#include "silo/common/fasta_format_exception.h" +#include "silo/file_reader/fasta_format_exception.h" #include #include -namespace silo { +namespace silo::file_reader { FastaFormatException::FastaFormatException(const std::string& error_message) : std::runtime_error(error_message.c_str()) {} diff --git a/src/silo/file_reader/fasta_reader.cpp b/src/silo/file_reader/fasta_reader.cpp new file mode 100644 index 000000000..db9428a77 --- /dev/null +++ b/src/silo/file_reader/fasta_reader.cpp @@ -0,0 +1,40 @@ +#include + +#include "silo/common/string_utils.h" +#include "silo/file_reader/fasta_format_exception.h" +#include "silo/file_reader/fasta_reader.h" + +namespace silo::file_reader { + +std::optional FastaReader::nextEntry() { + std::string data; + if (!getline(in_file.getInputStream(), data)) { + return std::nullopt; + } + + if (data.empty() || data.at(0) != '>') { + throw FastaFormatException("Fasta key prefix '>' missing for key: " + data); + } + + auto parts = splitBy(data.substr(1), "|"); + auto key = parts.front(); + auto fields = std::vector(parts.begin() + 1, parts.end()); + + if (key.empty()) { + throw FastaFormatException("Fasta description not valid, missing id: " + data); + } + + std::string sequence; + + if (!getline(in_file.getInputStream(), sequence)) { + throw FastaFormatException("Missing genome sequence in line following identifier: " + key); + } + + return ReadSequence{ + .key = key, + .offset = fields.empty() ? 0 : static_cast(std::stoi(fields[0])), + .sequence = sequence + }; +} + +} diff --git a/src/silo/common/fasta_reader.test.cpp b/src/silo/file_reader/fasta_reader.test.cpp similarity index 58% rename from src/silo/common/fasta_reader.test.cpp rename to src/silo/file_reader/fasta_reader.test.cpp index f2a0506c9..99e2c9b1d 100644 --- a/src/silo/common/fasta_reader.test.cpp +++ b/src/silo/file_reader/fasta_reader.test.cpp @@ -2,11 +2,12 @@ #include #include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/common/fasta_reader.h" #include "silo/config/preprocessing_config.h" +#include "silo/file_reader/fasta_format_exception.h" +#include "silo/file_reader/fasta_reader.h" + +using silo::file_reader::FastaReader; +using silo::file_reader::FastaFormatException; TEST(FastaReader, shouldReadFastaFile) { const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; @@ -15,22 +16,23 @@ TEST(FastaReader, shouldReadFastaFile) { std::filesystem::path file_path = input_directory; file_path += sequence_filename; - silo::FastaReader under_test(file_path); + FastaReader under_test(file_path); - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); EXPECT_EQ(key, "Key1"); EXPECT_EQ(genome, "ACGT"); - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + genome = entry.value().sequence; EXPECT_EQ(key, "Key2"); EXPECT_EQ(genome, "CGTA"); - key = under_test.next(genome); - EXPECT_FALSE(key != std::nullopt); + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); } TEST(FastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { @@ -40,18 +42,16 @@ TEST(FastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { std::filesystem::path file_path = input_directory; file_path += sequence_filename; - silo::FastaReader under_test(file_path); + FastaReader under_test(file_path); - std::optional key; - std::string genome; - - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); EXPECT_EQ(key, "Key"); EXPECT_EQ(genome, "ACGT"); - key = under_test.next(genome); - EXPECT_FALSE(key != std::nullopt); + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); } TEST(FastaReader, givenDataInWrongFormatThenShouldThrowAnException) { @@ -61,10 +61,9 @@ TEST(FastaReader, givenDataInWrongFormatThenShouldThrowAnException) { std::filesystem::path file_path = input_directory; file_path += sequence_filename; - silo::FastaReader under_test(file_path); + FastaReader under_test(file_path); - std::string genome; - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); + EXPECT_THROW(under_test.nextEntry(), FastaFormatException); } TEST(FastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { @@ -74,15 +73,13 @@ TEST(FastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { std::filesystem::path file_path = input_directory; file_path += sequence_filename; - silo::FastaReader under_test(file_path); - - std::optional key; - std::string genome; + FastaReader under_test(file_path); - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); EXPECT_EQ(key, "Key"); EXPECT_EQ(genome, "ACGT"); - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); + EXPECT_THROW(under_test.nextEntry(), FastaFormatException); } \ No newline at end of file diff --git a/src/silo/file_reader/file_reader.cpp b/src/silo/file_reader/file_reader.cpp new file mode 100644 index 000000000..2836d52c0 --- /dev/null +++ b/src/silo/file_reader/file_reader.cpp @@ -0,0 +1,6 @@ +#include "silo/file_reader/file_reader.h" + +void silo::file_reader::FileReader::reset() { + in_file.getInputStream().clear(); // clear fail and eof bits + in_file.getInputStream().seekg(0, std::ios::beg); // g pointer back to the start +} diff --git a/src/silo/file_reader/sam_format_exception.cpp b/src/silo/file_reader/sam_format_exception.cpp new file mode 100644 index 000000000..c26b00ecd --- /dev/null +++ b/src/silo/file_reader/sam_format_exception.cpp @@ -0,0 +1,11 @@ +#include "silo/file_reader/sam_format_exception.h" + +#include +#include + +namespace silo::file_reader { + +SamFormatException::SamFormatException(const std::string& error_message) + : std::runtime_error(error_message.c_str()) {} + +} // namespace silo \ No newline at end of file diff --git a/src/silo/file_reader/sam_reader.cpp b/src/silo/file_reader/sam_reader.cpp new file mode 100644 index 000000000..0aa359eb4 --- /dev/null +++ b/src/silo/file_reader/sam_reader.cpp @@ -0,0 +1,27 @@ +#include +#include + +#include "silo/file_reader/file_reader.h" +#include "silo/file_reader/sam_reader.h" + +namespace silo::file_reader { + +std::optional SamReader::nextEntry() { + std::string data; + if (!getline(in_file.getInputStream(), data)) { + return std::nullopt; + } + + // optional header data + while (data.at(0) == '@' && getline(in_file.getInputStream(), data)) { + ; + } + + auto parts = splitBy(data, "\t"); + return FileReader::ReadSequence{ + .key = parts.at(0), + .offset = static_cast(std::stoi(parts.at(3))), + .sequence = parts.at(9) + }; +} +} \ No newline at end of file diff --git a/src/silo/preprocessing/preprocessing_database.cpp b/src/silo/preprocessing/preprocessing_database.cpp index 4fcc0fd5f..742f51559 100644 --- a/src/silo/preprocessing/preprocessing_database.cpp +++ b/src/silo/preprocessing/preprocessing_database.cpp @@ -9,15 +9,14 @@ #include #include -#include "silo/common/fasta_reader.h" +#include "silo/file_reader/fasta_reader.h" +#include "silo/file_reader/sam_reader.h" #include "silo/preprocessing/partition.h" #include "silo/preprocessing/preprocessing_exception.h" #include "silo/preprocessing/sql_function.h" #include "silo/storage/reference_genomes.h" -#include "silo/zstdfasta/zstd_compressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_writer.h" +#include "silo/zstd/zstd_compressor.h" +#include "silo/zstd/zstd_table.h" using duckdb::BigIntValue; using duckdb::ListValue; @@ -26,6 +25,9 @@ using duckdb::Value; namespace silo::preprocessing { +constexpr std::string_view FASTA_EXTENSION = "fasta"; +constexpr std::string_view SAM_EXTENSION = "sam"; + PreprocessingDatabase::PreprocessingDatabase( const std::optional& backing_file, const ReferenceGenomes& reference_genomes @@ -109,22 +111,55 @@ preprocessing::Partitions PreprocessingDatabase::getPartitionDescriptor() { return preprocessing::Partitions(partitions); } -ZstdFastaTable PreprocessingDatabase::generateSequenceTableFromFasta( +ZstdTable PreprocessingDatabase::generateSequenceTableViaFile( + const std::string& table_name, + const std::string& reference_sequence, + const std::filesystem::path& file_path +) { + const auto file_stem = file_path.stem().string(); + for (const auto& entry : std::filesystem::directory_iterator(file_path.parent_path())) { + const auto entry_file_name = entry.path().filename().string(); + if (!entry.is_regular_file() || !entry_file_name.starts_with(file_stem)) { + continue; + } + auto extensions = splitBy(entry_file_name, "."); + auto last = extensions.back(); + if (last == "zst" || last == "xz") { + extensions.pop_back(); + last = extensions.back(); + } + if (last == FASTA_EXTENSION) { + return generateSequenceTableFromFasta(table_name, reference_sequence, entry.path()); + } + if (last == SAM_EXTENSION) { + return generateSequenceTableFromSAM(table_name, reference_sequence, entry.path()); + } + } + + throw PreprocessingException(fmt::format( + "Could not find reference file for {}, tried file extensions: .fasta(.zst,.xz), " + ".sam(.zst,.xz)", + file_path.string() + )); +} + +ZstdTable PreprocessingDatabase::generateSequenceTableFromFasta( const std::string& table_name, const std::string& reference_sequence, const std::string& filename ) { - silo::FastaReader fasta_reader(filename); - return ZstdFastaTable::generate(connection, table_name, fasta_reader, reference_sequence); + silo::file_reader::FastaReader fasta_reader(filename); + return ZstdTable::generate(connection, table_name, fasta_reader, reference_sequence); } -ZstdFastaTable PreprocessingDatabase::generateSequenceTableFromZstdFasta( +ZstdTable PreprocessingDatabase::generateSequenceTableFromSAM( const std::string& table_name, const std::string& reference_sequence, const std::string& filename ) { - silo::ZstdFastaReader zstd_fasta_reader(filename, reference_sequence); - return ZstdFastaTable::generate(connection, table_name, zstd_fasta_reader, reference_sequence); + silo::file_reader::SamReader sam_reader(filename); + + return ZstdTable::generate(connection, table_name, sam_reader, reference_sequence); } std::vector extractStringListValue( diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index c027991bc..f0386f5fd 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -6,12 +6,12 @@ #include #include "silo/common/block_timer.h" -#include "silo/common/fasta_reader.h" #include "silo/common/string_utils.h" #include "silo/common/table_reader.h" #include "silo/config/preprocessing_config.h" #include "silo/database.h" #include "silo/database_info.h" +#include "silo/file_reader/fasta_reader.h" #include "silo/preprocessing/metadata_info.h" #include "silo/preprocessing/preprocessing_database.h" #include "silo/preprocessing/preprocessing_exception.h" @@ -20,9 +20,8 @@ #include "silo/preprocessing/validated_ndjson_file.h" #include "silo/storage/reference_genomes.h" #include "silo/storage/unaligned_sequence_store.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_decompressor.h" +#include "silo/zstd/zstd_table.h" namespace silo::preprocessing { @@ -390,7 +389,7 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const ValidatedNdjsonFi for (const auto& prefixed_nuc_name : prefixed_nuc_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {0} AS\n" - "SELECT key, {0} AS sequence, partition_id, {1} " + "SELECT key, struct_pack(\"position\" := 0, sequence := {0}) AS read, partition_id, {1} " "FROM sequence_table;", prefixed_nuc_name, boost::join(prefixed_order_by_fields, ",") @@ -400,7 +399,7 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const ValidatedNdjsonFi for (const auto& prefixed_aa_name : prefixed_aa_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {0} AS\n" - "SELECT key, {0} AS sequence, partition_id, {1} " + "SELECT key, struct_pack(\"position\" := 0, sequence := {0}) AS read, partition_id, {1} " "FROM sequence_table;", prefixed_aa_name, boost::join(prefixed_order_by_fields, ",") @@ -520,16 +519,15 @@ void Preprocessor::createPartitionedSequenceTablesFromSequenceFiles() { .replace_extension(FASTA_EXTENSION) ); - preprocessing_db.generateSequenceTableFromFasta( + preprocessing_db.generateSequenceTableViaFile( "unaligned_tmp", reference_sequence, preprocessing_config.getUnalignedNucFilenameNoExtension(sequence_name) - .replace_extension(FASTA_EXTENSION) ); createUnalignedPartitionedSequenceFile( sequence_name, fmt::format( - "SELECT unaligned_tmp.key AS key, unaligned_tmp.sequence AS unaligned_nuc_{}, " + "SELECT unaligned_tmp.key AS key, unaligned_tmp.read AS unaligned_nuc_{}, " "partitioned_metadata.partition_id AS partition_id " "FROM unaligned_tmp RIGHT JOIN partitioned_metadata " "ON unaligned_tmp.key = partitioned_metadata.\"{}\" ", @@ -560,12 +558,12 @@ void Preprocessor::createPartitionedTableForSequence( fmt::format("{}{}{}", "raw_", SymbolType::PREFIX, sequence_name); const std::string table_name = fmt::format("{}{}", SymbolType::PREFIX, sequence_name); - preprocessing_db.generateSequenceTableFromFasta(raw_table_name, reference_sequence, filename); + preprocessing_db.generateSequenceTableViaFile(raw_table_name, reference_sequence, filename); (void)preprocessing_db.query(fmt::format( R"-( CREATE OR REPLACE VIEW {} AS - SELECT key, sequence, + SELECT key, read, partitioned_metadata.partition_id AS partition_id {} FROM {} AS raw RIGHT JOIN partitioned_metadata ON raw.key = partitioned_metadata."{}"; @@ -682,6 +680,41 @@ void Preprocessor::buildMetadataStore( } } +template +ColumnFunction Preprocessor::createRawReadLambda( + ZstdDecompressor& decompressor, + silo::SequenceStorePartition& sequence_store +) { + return {"read", [&sequence_store, &decompressor](size_t row_id, const duckdb::Value& value) { + ReadSequence& target = sequence_store.reserveRead(); + if (value.IsNull()) { + return; + } + const auto& children = duckdb::StructValue::GetChildren(value); + if (children[1].IsNull()) { + return; + } + decompressor.decompress(children[1].GetValueUnsafe(), target.sequence); + target.offset = children[0].GetValue(); + target.is_valid = true; + }}; +} + +template +silo::ColumnFunction Preprocessor::createInsertionLambda( + const std::string& sequence_name, + silo::SequenceStorePartition& sequence_store +) { + return {sequence_name, [&sequence_store](size_t row_id, const duckdb::Value& value) { + if (value.IsNull()) { + return; + } + for (const auto& child : duckdb::ListValue::GetChildren(value)) { + sequence_store.insertInsertion(row_id, child.GetValue()); + } + }}; +} + template void Preprocessor::buildSequenceStore( Database& database, @@ -707,36 +740,33 @@ void Preprocessor::buildSequenceStore( partition_index ); - auto& sequence_store = database.partitions.at(partition_index) - .template getSequenceStores() - .at(sequence_name); + SequenceStorePartition& sequence_store = + database.partitions.at(partition_index) + .template getSequenceStores() + .at(sequence_name); - silo::ZstdFastaTableReader sequence_input( + ZstdDecompressor decompressor(reference_sequence); + + auto column_function_reads = createRawReadLambda(decompressor, sequence_store); + + silo::TableReader( preprocessing_db.getConnection(), fmt::format("{}{}", SymbolType::PREFIX, sequence_name), - reference_sequence, - "sequence", + "key", + {column_function_reads}, fmt::format("partition_id = {}", partition_index), order_by_clause - ); - sequence_store.fill(sequence_input); + ) + .read(); + + auto column_function_insertions = + createInsertionLambda(sequence_name, sequence_store); - const silo::ColumnFunction column_function{ - sequence_name, - [&sequence_store](size_t row_id, const duckdb::Value& value) { - if (value.IsNull()) { - return; - } - for (const auto& child : duckdb::ListValue::GetChildren(value)) { - sequence_store.insertInsertion(row_id, child.GetValue()); - } - } - }; silo::TableReader( preprocessing_db.getConnection(), getInsertionsTableName(), "key", - {column_function}, + {column_function_insertions}, fmt::format("partition_id = {}", partition_index), order_by_clause ) diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 38f43d008..b4edb1cb9 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -53,6 +53,35 @@ const Scenario FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES = { }])") }; +// First 10 characters of sequence in nuc_main.sam have been removed +// This is to test the offset (set to 10 for both reads) +const Scenario SAM_FILES = { + .input_directory = "testBaseData/samFiles/", + .expected_sequence_count = 2, + .query = R"( + { + "action": { + "type": "FastaAligned", + "sequenceName": ["main"], + "orderByFields": ["accessionVersion"] + }, + "filterExpression": { + "type": "True" + } + } + )", + .expected_query_result = nlohmann::json::parse(R"([ + { + "accessionVersion": "1.1", + "main": "NNNNNNNNNNTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT" + }, + { + "accessionVersion": "1.3", + "main": "NNNNNNNNNNTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT" + } + ])") +}; + const Scenario NDJSON_FILE_WITH_MISSING_SEGMENTS_AND_GENES = { .input_directory = "testBaseData/ndjsonWithNullSequences/", .expected_sequence_count = FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES.expected_sequence_count, @@ -236,6 +265,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES, NDJSON_FILE_WITH_MISSING_SEGMENTS_AND_GENES, + SAM_FILES, NDJSON_WITH_SQL_KEYWORD_AS_FIELD, TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD, NDJSON_WITH_NUMERIC_NAMES, diff --git a/src/silo/preprocessing/sql_function.cpp b/src/silo/preprocessing/sql_function.cpp index b558a16bd..bbc665059 100644 --- a/src/silo/preprocessing/sql_function.cpp +++ b/src/silo/preprocessing/sql_function.cpp @@ -4,7 +4,7 @@ #include "silo/common/pango_lineage.h" #include "silo/storage/reference_genomes.h" -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" using duckdb::Connection; using duckdb::DataChunk; diff --git a/src/silo/query_engine/actions/fasta.cpp b/src/silo/query_engine/actions/fasta.cpp index d96645769..20462302c 100644 --- a/src/silo/query_engine/actions/fasta.cpp +++ b/src/silo/query_engine/actions/fasta.cpp @@ -13,7 +13,7 @@ #include "silo/query_engine/operator_result.h" #include "silo/query_engine/query_parse_exception.h" #include "silo/query_engine/query_result.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_table_reader.h" using silo::common::add1; using silo::common::Range; @@ -112,7 +112,7 @@ void addSequencesFromResultTableToJson( const std::string& sequence_name = sequence_names.at(sequence_idx); const std::string_view compression_dict = database_partition.unaligned_nuc_sequences.at(sequence_name).compression_dictionary; - silo::ZstdFastaTableReader table_reader( + silo::ZstdTableReader table_reader( connection, result_table_name, compression_dict, diff --git a/src/silo/storage/reference_genomes.cpp b/src/silo/storage/reference_genomes.cpp index 7f771fdb3..44ee54bc4 100644 --- a/src/silo/storage/reference_genomes.cpp +++ b/src/silo/storage/reference_genomes.cpp @@ -241,5 +241,4 @@ std::string ReferenceGenomes::vectorToString( template <> std::string ReferenceGenomes::vectorToString(const std::vector& vector ); - } // namespace silo \ No newline at end of file diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index 05ee6bd0d..3a55c3365 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -11,19 +11,21 @@ #include #include +#include "silo/zstd/zstd_decompressor.h" #include "silo/common/aa_symbols.h" +#include "silo/common/table_reader.h" #include "silo/common/nucleotide_symbols.h" #include "silo/common/string_utils.h" #include "silo/common/symbol_map.h" #include "silo/preprocessing/preprocessing_exception.h" #include "silo/storage/position.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" template silo::SequenceStorePartition::SequenceStorePartition( const std::vector& reference_sequence ) : reference_sequence(reference_sequence) { + lazy_buffer.reserve(BUFFER_SIZE); positions.reserve(reference_sequence.size()); for (const auto symbol : reference_sequence) { positions.emplace_back(Position::fromInitiallyFlipped(symbol)); @@ -31,41 +33,14 @@ silo::SequenceStorePartition::SequenceStorePartition( } template -size_t silo::SequenceStorePartition::fill(ZstdFastaTableReader& input) { - static constexpr size_t BUFFER_SIZE = 1024; - - input.loadTable(); - - size_t read_sequences_count = 0; - - std::vector> genome_buffer; - - std::optional key; - std::optional genome; - while (true) { - key = input.next(genome); - if (!key) { - break; - } - genome_buffer.push_back(std::move(genome)); - if (genome_buffer.size() >= BUFFER_SIZE) { - interpret(genome_buffer); - genome_buffer.clear(); - } - - ++read_sequences_count; +silo::ReadSequence& silo::SequenceStorePartition::reserveRead() { + if (lazy_buffer.size() > BUFFER_SIZE) { + interpret(lazy_buffer); + lazy_buffer.clear(); } - interpret(genome_buffer); - const SequenceStoreInfo info_before_optimisation = getInfo(); - optimizeBitmaps(); - - SPDLOG_DEBUG( - "Sequence store partition info after filling it: {}, and after optimising: {}", - info_before_optimisation, - getInfo() - ); - return read_sequences_count; + lazy_buffer.emplace_back(); + return lazy_buffer.back(); } namespace { @@ -112,6 +87,9 @@ void silo::SequenceStorePartition::insertInsertion( template void silo::SequenceStorePartition::finalize() { + interpret(lazy_buffer); + lazy_buffer.clear(); + SPDLOG_DEBUG("Building insertion index"); insertion_index.buildIndex(); @@ -159,9 +137,7 @@ const roaring::Roaring* silo::SequenceStorePartition::getBitmap( } template -void silo::SequenceStorePartition::fillIndexes( - const std::vector>& genomes -) { +void silo::SequenceStorePartition::fillIndexes(const std::vector& reads) { const size_t genome_length = positions.size(); static constexpr int COUNT_SYMBOLS_PER_PROCESSOR = 64; tbb::parallel_for( @@ -169,13 +145,13 @@ void silo::SequenceStorePartition::fillIndexes( [&](const auto& local) { SymbolMap> ids_per_symbol_for_current_position; for (size_t position_idx = local.begin(); position_idx != local.end(); ++position_idx) { - const size_t number_of_sequences = genomes.size(); + const size_t number_of_sequences = reads.size(); for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) { - const auto& genome = genomes[sequence_id]; - if (!genome.has_value()) { + const auto& [is_valid, sequence, offset] = reads[sequence_id]; + if (!is_valid || position_idx < offset || position_idx - offset >= sequence.size()) { continue; } - const char character = genome.value()[position_idx]; + const char character = sequence[position_idx - offset]; const auto symbol = SymbolType::charToSymbol(character); if (!symbol.has_value()) { throw silo::preprocessing::PreprocessingException( @@ -211,34 +187,38 @@ void silo::SequenceStorePartition::addSymbolsToPositions( } template -void silo::SequenceStorePartition::fillNBitmaps( - const std::vector>& genomes +void silo::SequenceStorePartition::fillNBitmaps(const std::vector& reads ) { const size_t genome_length = positions.size(); - missing_symbol_bitmaps.resize(sequence_count + genomes.size()); + missing_symbol_bitmaps.resize(sequence_count + reads.size()); - const tbb::blocked_range range(0, genomes.size()); + const tbb::blocked_range range(0, reads.size()); tbb::parallel_for(range, [&](const decltype(range)& local) { std::vector positions_with_symbol_missing; for (size_t sequence_index = local.begin(); sequence_index != local.end(); ++sequence_index) { - const auto& maybe_genome = genomes[sequence_index]; + const auto& [is_valid, maybe_sequence, offset] = reads[sequence_index]; - if (!maybe_genome.has_value()) { + if (!is_valid) { missing_symbol_bitmaps[sequence_count + sequence_index].addRange(0, genome_length); missing_symbol_bitmaps[sequence_count + sequence_index].runOptimize(); continue; } - const auto& genome = maybe_genome.value(); + missing_symbol_bitmaps[sequence_count + sequence_index].addRange(0, offset); - for (size_t position_idx = 0; position_idx < genome_length; ++position_idx) { - const char character = genome[position_idx]; + for (size_t position_idx = 0; position_idx < maybe_sequence.size(); ++position_idx) { + const char character = maybe_sequence[position_idx]; const auto symbol = SymbolType::charToSymbol(character); if (symbol == SymbolType::SYMBOL_MISSING) { - positions_with_symbol_missing.push_back(position_idx); + positions_with_symbol_missing.push_back(position_idx + offset); } } + + missing_symbol_bitmaps[sequence_count + sequence_index].addRange( + offset + maybe_sequence.size(), genome_length + ); + if (!positions_with_symbol_missing.empty()) { missing_symbol_bitmaps[sequence_count + sequence_index].addMany( positions_with_symbol_missing.size(), positions_with_symbol_missing.data() @@ -272,12 +252,10 @@ void silo::SequenceStorePartition::optimizeBitmaps() { } template -void silo::SequenceStorePartition::interpret( - const std::vector>& genomes -) { - fillIndexes(genomes); - fillNBitmaps(genomes); - sequence_count += genomes.size(); +void silo::SequenceStorePartition::interpret(const std::vector& reads) { + fillIndexes(reads); + fillNBitmaps(reads); + sequence_count += reads.size(); } template diff --git a/src/silo/storage/unaligned_sequence_store.cpp b/src/silo/storage/unaligned_sequence_store.cpp index 88548d695..3a4845659 100644 --- a/src/silo/storage/unaligned_sequence_store.cpp +++ b/src/silo/storage/unaligned_sequence_store.cpp @@ -16,7 +16,6 @@ #include "silo/common/symbol_map.h" #include "silo/persistence/exception.h" #include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" silo::UnalignedSequenceStorePartition::UnalignedSequenceStorePartition( std::string sql_for_reading_file, diff --git a/src/silo/zstdfasta/zstd_compressor.cpp b/src/silo/zstd/zstd_compressor.cpp similarity index 96% rename from src/silo/zstdfasta/zstd_compressor.cpp rename to src/silo/zstd/zstd_compressor.cpp index bf0011aa3..cb63d484b 100644 --- a/src/silo/zstdfasta/zstd_compressor.cpp +++ b/src/silo/zstd/zstd_compressor.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" #include #include diff --git a/src/silo/zstdfasta/zstd_context.cpp b/src/silo/zstd/zstd_context.cpp similarity index 95% rename from src/silo/zstdfasta/zstd_context.cpp rename to src/silo/zstd/zstd_context.cpp index 470beb7b3..ceb716aa7 100644 --- a/src/silo/zstdfasta/zstd_context.cpp +++ b/src/silo/zstd/zstd_context.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_context.h" +#include "silo/zstd/zstd_context.h" #include #include diff --git a/src/silo/zstdfasta/zstd_decompressor.cpp b/src/silo/zstd/zstd_decompressor.cpp similarity index 80% rename from src/silo/zstdfasta/zstd_decompressor.cpp rename to src/silo/zstd/zstd_decompressor.cpp index 74530140c..2acffe7d1 100644 --- a/src/silo/zstdfasta/zstd_decompressor.cpp +++ b/src/silo/zstd/zstd_decompressor.cpp @@ -1,21 +1,26 @@ -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" #include #include #include #include +#include namespace silo { ZstdDecompressor::ZstdDecompressor(std::string_view dictionary_string) : zstd_dictionary(ZstdDDictionary(dictionary_string)) {} -std::string_view ZstdDecompressor::decompress(const std::string& input) { - return decompress(input.data(), input.size()); +void ZstdDecompressor::decompress(const std::string& input, std::string& buffer) { + decompress(input.data(), input.size(), buffer); } -std::string_view ZstdDecompressor::decompress(const char* input_data, size_t input_length) { +void ZstdDecompressor::decompress( + const char* input_data, + size_t input_length, + std::string& buffer +) { const size_t uncompressed_size = ZSTD_getFrameContentSize(input_data, input_length); if (uncompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) { throw std::runtime_error(fmt::format( @@ -49,7 +54,7 @@ std::string_view ZstdDecompressor::decompress(const char* input_data, size_t inp fmt::format("Error '{}' in dependency when decompressing using zstd", error_name) ); } - return {buffer.data(), size_or_error_code}; + assert(uncompressed_size == size_or_error_code); } } // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstd_dictionary.cpp b/src/silo/zstd/zstd_dictionary.cpp similarity index 95% rename from src/silo/zstdfasta/zstd_dictionary.cpp rename to src/silo/zstd/zstd_dictionary.cpp index 78b5faeb3..5174de474 100644 --- a/src/silo/zstdfasta/zstd_dictionary.cpp +++ b/src/silo/zstd/zstd_dictionary.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_dictionary.h" +#include "silo/zstd/zstd_dictionary.h" #include diff --git a/src/silo/zstd/zstd_table.cpp b/src/silo/zstd/zstd_table.cpp new file mode 100644 index 000000000..628fca75f --- /dev/null +++ b/src/silo/zstd/zstd_table.cpp @@ -0,0 +1,72 @@ +#include "silo/zstd/zstd_table.h" + +#include +#include +#include + +#include "silo/preprocessing/preprocessing_exception.h" +#include "silo/zstd/zstd_compressor.h" + +namespace { +void initializeTable(duckdb::Connection& connection, std::string table_name) { + auto return_value = connection.Query(fmt::format("DROP TABLE IF EXISTS {};", table_name)); + if (return_value->HasError()) { + throw silo::preprocessing::PreprocessingException(return_value->GetError()); + } + return_value = connection.Query(fmt::format( + "CREATE TABLE {} (" + " key STRING," + " read STRUCT(position UINTEGER, sequence BLOB)" + ");", + table_name + )); + if (return_value->HasError()) { + throw silo::preprocessing::PreprocessingException(return_value->GetError()); + } +} +} // namespace + +namespace silo { + +using silo::file_reader::FileReader; + +ZstdTable::ZstdTable( + duckdb::Connection& connection, + std::string table_name, + std::string_view compression_dict +) + : connection(connection), + table_name(std::move(table_name)), + compression_dict(compression_dict) {} + +ZstdTable ZstdTable::generate( + duckdb::Connection& connection, + const std::string& table_name, + FileReader& file_reader, + std::string_view reference_sequence +) { + initializeTable(connection, table_name); + std::optional entry; + ZstdCompressor compressor(std::make_shared(reference_sequence, 2)); + duckdb::Appender appender(connection, table_name); + while (true) { + entry = file_reader.nextEntry(); + if (!entry) { + break; + } + const std::string_view compressed = compressor.compress(entry->sequence); + const auto* compressed_data = reinterpret_cast(compressed.data()); + const duckdb::string_t key_value = entry->key; + appender.BeginRow(); + appender.Append(key_value); + appender.Append(duckdb::Value::STRUCT( + {{"\"offset\"", duckdb::Value::UINTEGER(entry.value().offset)}, + {"sequence", duckdb::Value::BLOB(compressed_data, compressed.size())}} + )); + appender.EndRow(); + } + appender.Close(); + return {connection, table_name, reference_sequence}; +} + +} // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table_reader.cpp b/src/silo/zstd/zstd_table_reader.cpp similarity index 73% rename from src/silo/zstdfasta/zstdfasta_table_reader.cpp rename to src/silo/zstd/zstd_table_reader.cpp index 926bece40..c5a7d8da4 100644 --- a/src/silo/zstdfasta/zstdfasta_table_reader.cpp +++ b/src/silo/zstd/zstd_table_reader.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_table_reader.h" #include #include @@ -9,9 +9,9 @@ #include #include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" -silo::ZstdFastaTableReader::ZstdFastaTableReader( +silo::ZstdTableReader::ZstdTableReader( duckdb::Connection& connection, std::string_view table_name, std::string_view compression_dict, @@ -25,10 +25,10 @@ silo::ZstdFastaTableReader::ZstdFastaTableReader( where_clause(where_clause), order_by_clause(order_by_clause), decompressor(std::make_unique(compression_dict)) { - SPDLOG_TRACE("Initializing ZstdFastaTableReader for table {}", table_name); + SPDLOG_TRACE("Initializing ZstdTableReader for table {}", table_name); } -std::optional silo::ZstdFastaTableReader::nextKey() { +std::optional silo::ZstdTableReader::nextKey() { if (!current_chunk) { return std::nullopt; } @@ -36,7 +36,7 @@ std::optional silo::ZstdFastaTableReader::nextKey() { return current_chunk->GetValue(0, current_row).GetValue(); } -std::optional silo::ZstdFastaTableReader::nextSkipGenome() { +std::optional silo::ZstdTableReader::nextSkipGenome() { auto key = nextKey(); if (!key) { @@ -47,7 +47,7 @@ std::optional silo::ZstdFastaTableReader::nextSkipGenome() { return key; } -std::optional silo::ZstdFastaTableReader::nextCompressed( +std::optional silo::ZstdTableReader::nextCompressed( std::optional& compressed_genome ) { auto key = nextKey(); @@ -59,14 +59,19 @@ std::optional silo::ZstdFastaTableReader::nextCompressed( if (value.IsNull()) { compressed_genome = std::nullopt; } else { - compressed_genome = value.GetValueUnsafe(); + const auto& children = duckdb::StructValue::GetChildren(value); + if (children[1].IsNull()) { + compressed_genome = std::nullopt; + return std::nullopt; + } + compressed_genome = children[1].GetValueUnsafe(); } advanceRow(); return key; } -std::optional silo::ZstdFastaTableReader::next(std::optional& genome) { +std::optional silo::ZstdTableReader::next(std::optional& genome) { std::optional compressed_buffer; auto key = nextCompressed(compressed_buffer); @@ -75,7 +80,11 @@ std::optional silo::ZstdFastaTableReader::next(std::optionaldecompress(compressed_buffer.value()); + if (!genome) { // TODO(#230) this is not good. it is doing the opposite of buffering and + // should be removed + genome = std::string(); + } + decompressor->decompress(compressed_buffer.value(), genome.value()); } else { genome = std::nullopt; } @@ -83,7 +92,7 @@ std::optional silo::ZstdFastaTableReader::next(std::optionalsize()) { current_row = 0; @@ -130,7 +139,7 @@ void silo::ZstdFastaTableReader::advanceRow() { } } -void silo::ZstdFastaTableReader::copyTableTo(std::string_view file_name) { +void silo::ZstdTableReader::copyTableTo(std::string_view file_name) { query_result = connection.Query(fmt::format("COPY ({}) to '{}'", getTableQuery(), file_name)); if (query_result->HasError()) { SPDLOG_ERROR("Error when executing SQL " + query_result->GetError()); @@ -138,7 +147,7 @@ void silo::ZstdFastaTableReader::copyTableTo(std::string_view file_name) { } } -void silo::ZstdFastaTableReader::copyTableToPartitioned( +void silo::ZstdTableReader::copyTableToPartitioned( std::string_view file_name, std::string_view partition_key ) { @@ -151,7 +160,7 @@ void silo::ZstdFastaTableReader::copyTableToPartitioned( } } -size_t silo::ZstdFastaTableReader::lineCount() { +size_t silo::ZstdTableReader::lineCount() { query_result = connection.Query(fmt::format("SELECT COUNT(*) FROM ({})", getTableQuery())); if (query_result->HasError()) { SPDLOG_ERROR("Error when executing SQL " + query_result->GetError()); @@ -160,4 +169,4 @@ size_t silo::ZstdFastaTableReader::lineCount() { const duckdb::Value count_value = query_result->GetValue(0, 0); const uint64_t line_count = duckdb::BigIntValue::Get(count_value); return static_cast(line_count); -} +} \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_reader.cpp b/src/silo/zstdfasta/zstdfasta_reader.cpp deleted file mode 100644 index c0b00668a..000000000 --- a/src/silo/zstdfasta/zstdfasta_reader.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_reader.h" - -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/zstdfasta/zstd_decompressor.h" - -silo::ZstdFastaReader::ZstdFastaReader( - const std::filesystem::path& in_file_name, - std::string_view compression_dict -) - : decompressor(std::make_unique(compression_dict)) { - in_file = std::ifstream(in_file_name); - if (!in_file) { - throw std::runtime_error("Could not open file reader for file: " + in_file_name.string()); - } -} - -std::optional silo::ZstdFastaReader::nextKey() { - std::string key_with_prefix; - if (!getline(in_file, key_with_prefix)) { - return std::nullopt; - } - - if (key_with_prefix.empty() || key_with_prefix.at(0) != '>') { - throw FastaFormatException("Fasta key prefix '>' missing for key: " + key_with_prefix); - } - - return key_with_prefix.substr(1); -} - -std::optional silo::ZstdFastaReader::nextSkipGenome() { - auto key = nextKey(); - - if (!key) { - return std::nullopt; - } - - std::string bytestream_length_str; - if (!getline(in_file, bytestream_length_str)) { - throw FastaFormatException("Missing bytestream length in line following key: " + *key); - } - const size_t bytestream_length = std::stoul(bytestream_length_str); - - in_file.ignore(static_cast(bytestream_length)); - return key; -} - -std::optional silo::ZstdFastaReader::nextCompressed(std::string& compressed_genome) { - auto key = nextKey(); - if (!key) { - return std::nullopt; - } - - std::string bytestream_length_str; - if (!getline(in_file, bytestream_length_str)) { - throw FastaFormatException("Missing bytestream length in line following key: " + *key); - } - const size_t bytestream_length = std::stoul(bytestream_length_str); - - compressed_genome.resize(bytestream_length); - in_file.read(compressed_genome.data(), static_cast(compressed_genome.size())); - in_file.ignore(1); - return key; -} - -std::optional silo::ZstdFastaReader::next(std::string& genome) { - std::string compressed_buffer; - auto key = nextCompressed(compressed_buffer); - - if (!key) { - return std::nullopt; - } - genome = decompressor->decompress(compressed_buffer); - return key; -} - -void silo::ZstdFastaReader::reset() { - in_file.clear(); // clear fail and eof bits - in_file.seekg(0, std::ios::beg); // g pointer back to the start -} diff --git a/src/silo/zstdfasta/zstdfasta_reader.test.cpp b/src/silo/zstdfasta/zstdfasta_reader.test.cpp deleted file mode 100644 index 5d78357c2..000000000 --- a/src/silo/zstdfasta/zstdfasta_reader.test.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/config/preprocessing_config.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" - -TEST(ZstdFastaReader, shouldReadFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"no_end_new_line.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaReader, givenDataInWrongFormatThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"wrong_format.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::string genome; - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); -} - -TEST(ZstdFastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"missing_genome.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_THROW(key = under_test.next(genome), silo::FastaFormatException); -} \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table.cpp b/src/silo/zstdfasta/zstdfasta_table.cpp deleted file mode 100644 index 6626f9437..000000000 --- a/src/silo/zstdfasta/zstdfasta_table.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_table.h" - -#include -#include - -#include "silo/common/fasta_reader.h" -#include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstd_compressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" - -namespace { -void initializeTable(duckdb::Connection& connection, std::string table_name) { - auto return_value = connection.Query(fmt::format("DROP TABLE IF EXISTS {};", table_name)); - if (return_value->HasError()) { - throw silo::preprocessing::PreprocessingException(return_value->GetError()); - } - return_value = connection.Query(fmt::format( - "CREATE TABLE {} (" - " key STRING," - " sequence BLOB" - ");", - table_name - )); - if (return_value->HasError()) { - throw silo::preprocessing::PreprocessingException(return_value->GetError()); - } -} -} // namespace - -namespace silo { - -ZstdFastaTable::ZstdFastaTable( - duckdb::Connection& connection, - std::string table_name, - std::string_view compression_dict -) - : connection(connection), - table_name(std::move(table_name)), - compression_dict(compression_dict) {} - -ZstdFastaTable ZstdFastaTable::generate( - duckdb::Connection& connection, - const std::string& table_name, - ZstdFastaReader& file_reader, - std::string_view reference_sequence -) { - initializeTable(connection, table_name); - std::optional key; - std::string compressed; - duckdb::Appender appender(connection, table_name); - while (true) { - key = file_reader.nextCompressed(compressed); - if (key == std::nullopt) { - break; - } - const size_t compressed_size = compressed.size(); - const auto* compressed_data = reinterpret_cast(compressed.data()); - const duckdb::string_t key_value = key.value(); - appender.BeginRow(); - appender.Append(key_value); - appender.Append(duckdb::Value::BLOB(compressed_data, compressed_size)); - appender.EndRow(); - } - appender.Close(); - return {connection, table_name, reference_sequence}; -} - -ZstdFastaTable ZstdFastaTable::generate( - duckdb::Connection& connection, - const std::string& table_name, - FastaReader& file_reader, - std::string_view reference_sequence -) { - initializeTable(connection, table_name); - std::optional key; - std::string uncompressed; - ZstdCompressor compressor(std::make_shared(reference_sequence, 2)); - duckdb::Appender appender(connection, table_name); - while (true) { - key = file_reader.next(uncompressed); - if (key == std::nullopt) { - break; - } - const std::string_view compressed = compressor.compress(uncompressed); - const auto* compressed_data = reinterpret_cast(compressed.data()); - const duckdb::string_t key_value = key.value(); - appender.BeginRow(); - appender.Append(key_value); - appender.Append(duckdb::Value::BLOB(compressed_data, compressed.size())); - appender.EndRow(); - } - appender.Close(); - return {connection, table_name, reference_sequence}; -} - -ZstdFastaTableReader ZstdFastaTable::getReader( - std::string_view where_clause, - std::string_view order_by_clause -) { - return ZstdFastaTableReader( - connection, table_name, compression_dict, "sequence", where_clause, order_by_clause - ); -} - -} // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp b/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp deleted file mode 100644 index 5030b2d5a..000000000 --- a/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include - -#include - -#include "silo/common/fasta_reader.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" - -TEST(ZstdFastaTableReader, correctlyReadsZstdFastaTableFromFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.fasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::FastaReader file_reader(file_path); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test(connection, "test", "ACGT", "sequence", "true", ""); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlyReadsZstdFastaTableFromZstdFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::ZstdFastaReader file_reader(file_path, "ACGT"); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test(connection, "test", "ACGT", "sequence", "true", ""); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlySortsZstdFastaTableFromFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.fasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::FastaReader file_reader(file_path); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test( - connection, "test", "ACGT", "sequence", "true", "ORDER BY key desc" - ); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlySortsZstdFastaTableFromZstdFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::ZstdFastaReader file_reader(file_path, "ACGT"); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test( - connection, "test", "ACGT", "sequence", "true", "ORDER BY key desc" - ); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_writer.cpp b/src/silo/zstdfasta/zstdfasta_writer.cpp deleted file mode 100644 index d0c5a2cfe..000000000 --- a/src/silo/zstdfasta/zstdfasta_writer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_writer.h" - -#include -#include - -#include - -#include "silo/zstdfasta/zstd_compressor.h" - -silo::ZstdFastaWriter::ZstdFastaWriter( - const std::filesystem::path& out_file, - std::string_view compression_dict -) - : compressor( - std::make_unique(std::make_shared(compression_dict, 2)) - ) { - if (!exists(out_file)) { - SPDLOG_DEBUG("ZSTD Sequence Writer at {} does not yet exist", out_file.string()); - if (!exists(out_file.parent_path())) { - SPDLOG_DEBUG("Parent path {} does not yet exist as well", out_file.string()); - if (!create_directories(out_file.parent_path())) { - SPDLOG_DEBUG("Parent path {} could not be created", out_file.string()); - throw std::runtime_error( - "Could not create zstdwriter for file " + std::string{out_file.string()} - ); - } - } - } - outStream = std::ofstream(out_file.string()); - - if (!outStream) { - throw std::runtime_error( - "Could not create ofstream for file " + std::string{out_file.string()} - ); - } - SPDLOG_DEBUG("ZSTD Sequence Writer at {} successfully created", out_file.string()); -} - -silo::ZstdFastaWriter::ZstdFastaWriter( - const std::filesystem::path& out_file, - std::string_view compression_dict, - const std::string& default_sequence_ -) - : ZstdFastaWriter(out_file, compression_dict) { - default_sequence = std::string(compressor->compress(default_sequence_)); -} - -// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -void silo::ZstdFastaWriter::write(const std::string& key, const std::string& genome) { - const std::string_view compressed = compressor->compress(genome); - - outStream << '>' << key << '\n' << std::to_string(compressed.size()) << '\n'; - outStream.write(compressed.data(), static_cast(compressed.size())); - outStream << '\n'; -} - -void silo::ZstdFastaWriter::writeRaw(const std::string& key, const std::string& compressed_genome) { - outStream << '>' << key << '\n' - << std::to_string(compressed_genome.size()) << '\n' - << compressed_genome << '\n'; -} - -void silo::ZstdFastaWriter::writeRaw(const std::string& key, std::string_view compressed_genome) { - outStream << '>' << key << '\n' - << std::to_string(compressed_genome.size()) << '\n' - << compressed_genome << '\n'; -} - -void silo::ZstdFastaWriter::writeDefault(const std::string& key) { - if (default_sequence == std::nullopt) { - throw std::runtime_error( - "Tried to write default sequence, when none was provided in the ZstdFastaWriter " - "constructor" - ); - } - outStream << '>' << key << '\n' - << std::to_string(default_sequence->size()) << '\n' - << *default_sequence << '\n'; -} diff --git a/src/silo/zstdfasta/zstdfasta_writer.test.cpp b/src/silo/zstdfasta/zstdfasta_writer.test.cpp deleted file mode 100644 index de0fc22f1..000000000 --- a/src/silo/zstdfasta/zstdfasta_writer.test.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include - -#include "silo/config/preprocessing_config.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_writer.h" - -TEST(ZstdFastaWriter, writesCorrectFiles) { - const std::filesystem::path file_path("./testBaseData/tmp/test.fasta"); - - const std::string reference_genome = "ACGTACGTACGTACGT"; - const std::vector> values{ - {"Key1", "ACGTACGTACGTACGT"}, - {"Key2", "ACGTACGTACGTCCGT"}, - {"Key3", "ACGTACGTACGTACGT"}, - {"Key4", "CAGTTCGTACGTACGT"}, - {"Key5", "ACGTACGTACCTACGC"} - }; - - { - silo::ZstdFastaWriter under_test(file_path, reference_genome); - - for (const auto& value : values) { - under_test.write(value.first, value.second); - } - } - - { - silo::ZstdFastaReader reader(file_path, reference_genome); - - std::optional key; - std::string genome; - - for (const auto& value : values) { - key = reader.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, value.first); - EXPECT_EQ(genome, value.second); - } - key = reader.next(genome); - EXPECT_FALSE(key != std::nullopt); - } -} diff --git a/testBaseData/samFiles/aa_insertions.tsv b/testBaseData/samFiles/aa_insertions.tsv new file mode 100644 index 000000000..36ca1315d --- /dev/null +++ b/testBaseData/samFiles/aa_insertions.tsv @@ -0,0 +1,3 @@ +accessionVersion someLongGene someShortGene +1.1 [214:EPE] [] +1.1 [] [] \ No newline at end of file diff --git a/testBaseData/samFiles/database_config.yaml b/testBaseData/samFiles/database_config.yaml new file mode 100644 index 000000000..cc0a6826c --- /dev/null +++ b/testBaseData/samFiles/database_config.yaml @@ -0,0 +1,13 @@ +schema: + instanceName: Test + metadata: + - name: date + type: date + - name: host + type: string + - name: pangoLineage + type: pango_lineage + - name: accessionVersion + type: string + primaryKey: accessionVersion + dateToSortBy: date diff --git a/testBaseData/samFiles/gene_someLongGene.fasta b/testBaseData/samFiles/gene_someLongGene.fasta new file mode 100644 index 000000000..5272ea776 --- /dev/null +++ b/testBaseData/samFiles/gene_someLongGene.fasta @@ -0,0 +1,2 @@ +>1.1 +ACDEFGHIKLMNPQRSTVWYBZX-* \ No newline at end of file diff --git a/testBaseData/samFiles/gene_someShortGene.fasta b/testBaseData/samFiles/gene_someShortGene.fasta new file mode 100644 index 000000000..6aeb8d1d6 --- /dev/null +++ b/testBaseData/samFiles/gene_someShortGene.fasta @@ -0,0 +1,2 @@ +>1.1 +MADS \ No newline at end of file diff --git a/testBaseData/samFiles/metadata.tsv b/testBaseData/samFiles/metadata.tsv new file mode 100644 index 000000000..d1ef65889 --- /dev/null +++ b/testBaseData/samFiles/metadata.tsv @@ -0,0 +1,3 @@ +date host accessionVersion pangoLineage +2002-12-15 google.com 1.1 XBB.1.5 + 1.3 XBB.1.5 \ No newline at end of file diff --git a/testBaseData/samFiles/nuc_insertions.tsv b/testBaseData/samFiles/nuc_insertions.tsv new file mode 100644 index 000000000..5986c3403 --- /dev/null +++ b/testBaseData/samFiles/nuc_insertions.tsv @@ -0,0 +1,2 @@ +accessionVersion main secondSegment +1.1 [] [] diff --git a/testBaseData/samFiles/nuc_main.sam b/testBaseData/samFiles/nuc_main.sam new file mode 100644 index 000000000..b95906123 --- /dev/null +++ b/testBaseData/samFiles/nuc_main.sam @@ -0,0 +1,2 @@ +1.1 99 NC_045512.2 10 60 5S246M = 201 404 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +1.3 99 NC_045512.2 10 60 47S204M = 209 400 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; diff --git a/testBaseData/samFiles/nuc_secondSegment.fasta b/testBaseData/samFiles/nuc_secondSegment.fasta new file mode 100644 index 000000000..e69de29bb diff --git a/testBaseData/samFiles/preprocessing_config.yaml b/testBaseData/samFiles/preprocessing_config.yaml new file mode 100644 index 000000000..139c1112a --- /dev/null +++ b/testBaseData/samFiles/preprocessing_config.yaml @@ -0,0 +1,2 @@ +inputDirectory: "testBaseData/samFiles" +metadataFilename: "metadata.tsv" diff --git a/testBaseData/samFiles/reference_genomes.json b/testBaseData/samFiles/reference_genomes.json new file mode 100644 index 000000000..059643fd9 --- /dev/null +++ b/testBaseData/samFiles/reference_genomes.json @@ -0,0 +1,22 @@ +{ + "nucleotideSequences": [ + { + "name": "main", + "sequence": "ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT" + }, + { + "name": "secondSegment", + "sequence": "AAAAAAAAAAAAAAAA" + } + ], + "genes": [ + { + "name": "someLongGene", + "sequence": "AAAAAAAAAAAAAAAAAAAAAAAAA" + }, + { + "name": "someShortGene", + "sequence": "MADS" + } + ] +} diff --git a/testBaseData/samFiles/unaligned_main.sam b/testBaseData/samFiles/unaligned_main.sam new file mode 100644 index 000000000..fe158d060 --- /dev/null +++ b/testBaseData/samFiles/unaligned_main.sam @@ -0,0 +1,2 @@ +1.1 99 NC_045512.2 44 60 5S246M = 201 404 CACA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +1.3 99 NC_045512.2 45 60 47S204M = 209 400 CACA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; diff --git a/testBaseData/samFiles/unaligned_secondSegment.fasta b/testBaseData/samFiles/unaligned_secondSegment.fasta new file mode 100644 index 000000000..e69de29bb