diff --git a/include/silo/common/string_utils.h b/include/silo/common/string_utils.h index 7bbc89354..28ff26410 100644 --- a/include/silo/common/string_utils.h +++ b/include/silo/common/string_utils.h @@ -12,4 +12,17 @@ std::string removeSymbol(const std::string& value, char symbol); std::vector slice(const std::vector& elements, size_t start, size_t end); +std::vector prepend( + const std::string& prefix, + const std::vector& elements +); + +std::vector tie( + const std::string& prefix, + const std::vector& elements1, + const std::string& delimiter, + const std::vector& elements2, + const std::string& suffix +); + } // namespace silo diff --git a/include/silo/config/database_config.h b/include/silo/config/database_config.h index ec995b46d..ecf043ab5 100644 --- a/include/silo/config/database_config.h +++ b/include/silo/config/database_config.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -31,8 +32,6 @@ class DatabaseSchema { std::string primary_key; std::optional date_to_sort_by; std::optional partition_by; - - [[nodiscard]] std::string getStrictOrderByClause() const; }; class DatabaseConfig { @@ -52,6 +51,14 @@ class DatabaseConfigReader { } // namespace silo::config +template <> +struct std::less { + bool operator()( + const silo::config::DatabaseMetadata& lhs, + const silo::config::DatabaseMetadata& rhs + ) const; +}; + template <> struct [[maybe_unused]] fmt::formatter : fmt::formatter { [[maybe_unused]] static auto format( diff --git a/include/silo/preprocessing/metadata_info.h b/include/silo/preprocessing/metadata_info.h index e6ce45ce4..4e23dac23 100644 --- a/include/silo/preprocessing/metadata_info.h +++ b/include/silo/preprocessing/metadata_info.h @@ -2,34 +2,37 @@ #include #include -#include #include #include "silo/config/database_config.h" namespace silo::preprocessing { -class PreprocessingDatabase; - class MetadataInfo { - std::unordered_map metadata_selects; - - MetadataInfo(std::unordered_map metadata_selects); - public: - static MetadataInfo validateFromMetadataFile( + static void validateMetadataFile( const std::filesystem::path& metadata_file, const silo::config::DatabaseConfig& database_config ); - static MetadataInfo validateFromNdjsonFile( + static void validateNdjsonFile( const std::filesystem::path& ndjson_file, const silo::config::DatabaseConfig& database_config ); - std::vector getMetadataFields() const; + static std::vector getMetadataFields( + const silo::config::DatabaseConfig& database_config + ); - std::vector getMetadataSelects() const; + static std::vector getMetadataTypes( + const silo::config::DatabaseConfig& database_config + ); + + static std::string getMetadataStruct(const silo::config::DatabaseConfig& database_config); + + static std::vector getMetadataSelects( + const silo::config::DatabaseConfig& database_config + ); }; } // namespace silo::preprocessing diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index 6491963d9..416173ce8 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -14,6 +14,9 @@ namespace preprocessing { class SequenceInfo; class Preprocessor { + std::vector nuc_sequences; + std::vector aa_sequences; + std::vector order_by_fields; config::PreprocessingConfig preprocessing_config; config::DatabaseConfig database_config; PreprocessingDatabase preprocessing_db; @@ -39,24 +42,15 @@ class Preprocessor { void buildEmptyPartitioning(); void createInsertionsTableFromFile( - const std::map& expected_sequences, + const std::vector& expected_sequences, const std::filesystem::path& insertion_file, const std::string& table_name ); void createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name); - void createAlignedPartitionedSequenceViews( - const std::filesystem::path& file_name, - const SequenceInfo& sequence_info, - const std::string& partition_by_select, - const std::string& partition_by_where - ); - void createUnalignedPartitionedSequenceFiles( - const std::filesystem::path& file_name, - const std::string& partition_by_select, - const std::string& partition_by_where - ); + void createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name); + void createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name); void createUnalignedPartitionedSequenceFile( const std::string& seq_name, const std::string& table_sql @@ -72,7 +66,6 @@ class Preprocessor { Database buildDatabase( const preprocessing::Partitions& partition_descriptor, - const std::string& order_by_clause, const std::filesystem::path& intermediate_results_directory ); diff --git a/include/silo/preprocessing/sequence_info.h b/include/silo/preprocessing/sequence_info.h index 82a6f68e0..3fb4861f6 100644 --- a/include/silo/preprocessing/sequence_info.h +++ b/include/silo/preprocessing/sequence_info.h @@ -19,32 +19,32 @@ namespace preprocessing { class PreprocessingDatabase; class SequenceInfo { - std::vector nuc_sequence_names; - std::vector aa_sequence_names; - public: - explicit SequenceInfo(const silo::ReferenceGenomes& reference_genomes); - - [[nodiscard]] std::vector getAlignedSequenceSelects( + [[nodiscard]] static std::vector getAlignedSequenceSelects( + const silo::ReferenceGenomes& reference_genomes, const PreprocessingDatabase& preprocessing_db - ) const; + ); - static std::string getNucleotideSequenceSelect( + [[nodiscard]] static std::string getNucleotideSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - static std::string getUnalignedSequenceSelect( + [[nodiscard]] static std::string getUnalignedSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - static std::string getAminoAcidSequenceSelect( + [[nodiscard]] static std::string getAminoAcidSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - void validate(duckdb::Connection& connection, const std::filesystem::path& input_filename) const; + static void validateNdjsonFile( + const silo::ReferenceGenomes& reference_genomes, + duckdb::Connection& connection, + const std::filesystem::path& input_filename + ); }; } // namespace preprocessing } // namespace silo \ No newline at end of file diff --git a/src/silo/common/string_utils.cpp b/src/silo/common/string_utils.cpp index 6df995d74..2f377a044 100644 --- a/src/silo/common/string_utils.cpp +++ b/src/silo/common/string_utils.cpp @@ -1,8 +1,11 @@ #include "silo/common/string_utils.h" #include +#include #include +#include + namespace silo { std::vector splitBy(const std::string& value, const std::string_view delimiter) { @@ -40,4 +43,35 @@ std::vector slice(const std::vector& elements, size_t } return sliced_elements; } + +std::vector prepend( + const std::string& prefix, + const std::vector& elements +) { + std::vector output; + output.reserve(elements.size()); + for (const std::string& str : elements) { + output.emplace_back(prefix + str); + } + return output; +} + +std::vector tie( + const std::string& prefix, + const std::vector& elements1, + const std::string& delimiter, + const std::vector& elements2, + const std::string& suffix +) { + assert(elements1.size() == elements2.size()); + std::vector output; + output.reserve(elements1.size()); + for (size_t i = 0; i < elements1.size(); ++i) { + output.emplace_back( + fmt::format("{}{}{}{}{}", prefix, elements1[i], delimiter, elements2[i], suffix) + ); + } + return output; +} + } // namespace silo \ No newline at end of file diff --git a/src/silo/config/database_config.cpp b/src/silo/config/database_config.cpp index cabb859be..48e058a02 100644 --- a/src/silo/config/database_config.cpp +++ b/src/silo/config/database_config.cpp @@ -57,6 +57,13 @@ std::string toString(ValueType type) { } } // namespace +bool std::less::operator()( + const silo::config::DatabaseMetadata& lhs, + const silo::config::DatabaseMetadata& rhs +) const { + return lhs.name < rhs.name; +} + namespace YAML { template <> struct convert { @@ -179,16 +186,6 @@ ColumnType DatabaseMetadata::getColumnType() const { throw std::runtime_error("Did not find metadata with name: " + std::string(name)); } -std::string DatabaseSchema::getStrictOrderByClause() const { - if (date_to_sort_by.has_value()) { - SPDLOG_INFO("preprocessing - produce order by clause with a date to sort by"); - return fmt::format("ORDER BY {}, {}", date_to_sort_by.value(), primary_key); - } - - SPDLOG_INFO("preprocessing - produce order by clause without a date to sort by"); - return fmt::format("ORDER BY {}", primary_key); -} - std::optional DatabaseConfig::getMetadata(const std::string& name) const { auto element = std::find_if( std::begin(schema.metadata), diff --git a/src/silo/preprocessing/metadata_info.cpp b/src/silo/preprocessing/metadata_info.cpp index 9be40c6ac..56e12a996 100644 --- a/src/silo/preprocessing/metadata_info.cpp +++ b/src/silo/preprocessing/metadata_info.cpp @@ -9,83 +9,82 @@ #include "silo/preprocessing/preprocessing_exception.h" namespace { - -std::unordered_map validateFieldsAgainstConfig( - const std::unordered_map& found_metadata_fields, - const silo::config::DatabaseConfig& database_config -) { - std::vector config_metadata_fields; - std::transform( - database_config.schema.metadata.begin(), - database_config.schema.metadata.end(), - std::back_inserter(config_metadata_fields), - [](auto metadata) { return metadata.name; } - ); - - std::unordered_map validated_metadata_fields; - for (const auto& [field_name, access_path] : found_metadata_fields) { - if (std::find(config_metadata_fields.begin(), config_metadata_fields.end(), field_name) != - config_metadata_fields.end()) { - validated_metadata_fields.emplace(field_name, access_path); - } else { - SPDLOG_WARN( - "Metadata field {} ({}), which is contained in the file is not contained in the " - "config.", - field_name, - access_path - ); - } +using silo::config::ValueType; + +std::string toSQLType(ValueType value_type) { + switch (value_type) { + case ValueType::INT: + return "INT4"; + case ValueType::STRING: + case ValueType::PANGOLINEAGE: + return "VARCHAR"; + case ValueType::FLOAT: + return "FLOAT4"; + case ValueType::BOOL: + return "BOOL"; + case ValueType::DATE: + return "DATE"; } - for (const std::string& name : config_metadata_fields) { - if (!validated_metadata_fields.contains(name)) { - throw silo::preprocessing::PreprocessingException(fmt::format( - "The metadata field '{}' which is contained in the database config is " - "not contained in the input.", - name - )); - } - } - - std::string metadata_field_string; - for (const auto& [field_name, select] : validated_metadata_fields) { - metadata_field_string += "'"; - metadata_field_string += field_name; - metadata_field_string += "' with selection '"; - metadata_field_string += select; - metadata_field_string += "',"; - } - SPDLOG_TRACE("Found metadata fields: " + metadata_field_string); - return validated_metadata_fields; } } // namespace namespace silo::preprocessing { -MetadataInfo::MetadataInfo(std::unordered_map metadata_selects) - : metadata_selects(std::move(metadata_selects)) {} - -MetadataInfo MetadataInfo::validateFromMetadataFile( +void MetadataInfo::validateMetadataFile( const std::filesystem::path& metadata_file, const silo::config::DatabaseConfig& database_config ) { duckdb::DuckDB duck_db(nullptr); duckdb::Connection connection(duck_db); // Get the column names (headers) of the table - auto result = - connection.Query(fmt::format("SELECT * FROM '{}' LIMIT 0", metadata_file.string())); + auto result = connection.Query(fmt::format( + "SELECT * FROM read_csv_auto('{}', delim = '\t', header = true) LIMIT 0", + metadata_file.string() + )); - std::unordered_map file_metadata_fields; + if (result->HasError()) { + const std::string error_message = fmt::format( + "Preprocessing exception when retrieving the fields of the " + "metadata file '{}', " + "duckdb threw with error: {}", + metadata_file.string(), + result->GetError() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + + std::set actual_fields; for (size_t idx = 0; idx < result->ColumnCount(); idx++) { - file_metadata_fields[result->ColumnName(idx)] = "\"" + result->ColumnName(idx) + "\""; + actual_fields.emplace(result->ColumnName(idx)); + if (std::find_if(database_config.schema.metadata.begin(), database_config.schema.metadata.end(), [&](const auto& metadata) { + return metadata.name == result->ColumnName(idx); + }) == database_config.schema.metadata.end()) { + SPDLOG_WARN( + "The field '{}' which is contained in the metadata file '{}' is not contained in the " + "database config.", + result->ColumnName(idx), + metadata_file.string() + ); + } } - const std::unordered_map validated_metadata_fields = - validateFieldsAgainstConfig(file_metadata_fields, database_config); - return {validated_metadata_fields}; + for (const auto& field : database_config.schema.metadata) { + if (!actual_fields.contains(field.name)) { + const std::string error_message = fmt::format( + "The field '{}' which is contained in the database config is not contained in the " + "input field '{}'.", + field.name, + metadata_file.string() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + } } -MetadataInfo MetadataInfo::validateFromNdjsonFile( +void MetadataInfo::validateNdjsonFile( const std::filesystem::path& ndjson_file, const silo::config::DatabaseConfig& database_config ) { @@ -93,53 +92,89 @@ MetadataInfo MetadataInfo::validateFromNdjsonFile( duckdb::Connection connection(duck_db); auto result = connection.Query(fmt::format( - "SELECT json_keys(metadata) " - "FROM read_json_auto(\"{}\") LIMIT 1; ", + "SELECT metadata.* " + "FROM read_json_auto(\"{}\") LIMIT 0; ", ndjson_file.string() )); + if (result->HasError()) { - throw silo::preprocessing::PreprocessingException( - "Preprocessing exception when retrieving the field 'metadata', " - "duckdb threw with error: " + + SPDLOG_WARN( + "Preprocessing exception when retrieving the fields of the struct 'metadata' from the " + "metadata ndjson file '{}', " + "duckdb threw with error: {}", + ndjson_file.string(), result->GetError() ); - } - if (result->RowCount() == 0) { - throw silo::preprocessing::PreprocessingException(fmt::format( - "File {} is empty, which must not be empty at this point", ndjson_file.string() - )); - } - if (result->RowCount() > 1) { - throw silo::preprocessing::PreprocessingException( - "Internal exception, expected Row Count=1, actual " + std::to_string(result->RowCount()) - ); + return; } - std::unordered_map metadata_fields_to_validate; - for (const std::string& metadata_field : preprocessing::extractStringListValue(*result, 0, 0)) { - metadata_fields_to_validate[metadata_field] = "metadata.\"" + metadata_field + "\""; + std::set actual_fields; + for (size_t idx = 0; idx < result->ColumnCount(); idx++) { + actual_fields.emplace(result->ColumnName(idx)); + if (std::find_if(database_config.schema.metadata.begin(), database_config.schema.metadata.end(), [&](const auto& metadata) { + return metadata.name == result->ColumnName(idx); + }) == database_config.schema.metadata.end()) { + SPDLOG_WARN( + "The field '{}' which is contained in the metadata file '{}' is not contained in the " + "database config.", + result->ColumnName(idx), + ndjson_file.string() + ); + } } - const std::unordered_map validated_metadata_fields = - validateFieldsAgainstConfig(metadata_fields_to_validate, database_config); + for (const auto& field : database_config.schema.metadata) { + if (!actual_fields.contains(field.name)) { + const std::string error_message = fmt::format( + "The field '{}' which is contained in the database config is not contained in the " + "input field '{}'.", + field.name, + ndjson_file.string() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + } +} - return {validated_metadata_fields}; +std::vector MetadataInfo::getMetadataFields( + const silo::config::DatabaseConfig& database_config +) { + std::vector ret; + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back("\"" + field.name + "\""); + } + return ret; } -std::vector MetadataInfo::getMetadataFields() const { +std::vector MetadataInfo::getMetadataTypes( + const silo::config::DatabaseConfig& database_config +) { std::vector ret; - ret.reserve(metadata_selects.size()); - for (const auto& [field, _] : metadata_selects) { - ret.push_back("\"" + field + "\""); + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format("\"{}\" {}", field.name, toSQLType(field.type))); } return ret; } -std::vector MetadataInfo::getMetadataSelects() const { +std::string MetadataInfo::getMetadataStruct(const silo::config::DatabaseConfig& database_config) { + std::vector ret; + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format("{}: \'{}\'", field.name, toSQLType(field.type))); + } + return fmt::format("{{{}}}", boost::join(ret, ",")); +} + +std::vector MetadataInfo::getMetadataSelects( + const silo::config::DatabaseConfig& database_config +) { std::vector ret; - ret.reserve(metadata_selects.size()); - for (const auto& [field, select] : metadata_selects) { - ret.push_back(fmt::format(R"({} AS "{}")", select, field)); + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format(R"( "metadata"."{0}" AS "{0}")", field.name)); } return ret; } diff --git a/src/silo/preprocessing/metadata_info.test.cpp b/src/silo/preprocessing/metadata_info.test.cpp index db2070af2..e9e21c4e9 100644 --- a/src/silo/preprocessing/metadata_info.test.cpp +++ b/src/silo/preprocessing/metadata_info.test.cpp @@ -25,7 +25,7 @@ TEST( }; EXPECT_THROW( - silo::preprocessing::MetadataInfo::validateFromMetadataFile( + silo::preprocessing::MetadataInfo::validateMetadataFile( "testBaseData/exampleDataset/small_metadata_set.tsv", some_config_with_one_column_not_in_metadata ), @@ -50,10 +50,7 @@ TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::validateFromMetadataFile( - "testBaseData/exampleDataset/small_metadata_set.tsv", valid_config - ) - .getMetadataFields(); + const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("gisaid_epi_isl")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("pango_lineage")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("date")") != fields.end()); @@ -77,10 +74,7 @@ TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::validateFromNdjsonFile( - "testBaseData/exampleDatasetAsNdjson/input_file.ndjson", valid_config - ) - .getMetadataFields(); + const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("gisaid_epi_isl")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("pango_lineage")") != fields.end()); diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index 1b531a476..e8518c99a 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -7,6 +7,7 @@ #include "silo/common/block_timer.h" #include "silo/common/fasta_reader.h" +#include "silo/common/string_utils.h" #include "silo/common/table_reader.h" #include "silo/config/preprocessing_config.h" #include "silo/database.h" @@ -38,7 +39,18 @@ Preprocessor::Preprocessor( database_config(std::move(database_config_)), preprocessing_db(preprocessing_config.getPreprocessingDatabaseLocation(), reference_genomes), reference_genomes_(reference_genomes), - alias_lookup_(std::move(alias_lookup)) {} + alias_lookup_(std::move(alias_lookup)) { + for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { + nuc_sequences.emplace_back(seq_name); + } + for (const auto& [seq_name, _] : reference_genomes_.raw_aa_sequences) { + aa_sequences.emplace_back(seq_name); + } + if (database_config.schema.date_to_sort_by.has_value()) { + order_by_fields.emplace_back(database_config.schema.date_to_sort_by.value()); + } + order_by_fields.emplace_back(database_config.schema.primary_key); +} Database Preprocessor::preprocess() { SPDLOG_INFO( @@ -78,12 +90,12 @@ Database Preprocessor::preprocess() { buildPartitioningTable(); SPDLOG_DEBUG("preprocessing - creating insertions tables for building SILO"); createInsertionsTableFromFile( - reference_genomes_.raw_nucleotide_sequences, + nuc_sequences, preprocessing_config.getNucleotideInsertionsFilename(), NUC_INSERTION_TABLE_NAME ); createInsertionsTableFromFile( - reference_genomes_.raw_aa_sequences, + aa_sequences, preprocessing_config.getAminoAcidInsertionsFilename(), AA_INSERTION_TABLE_NAME ); @@ -94,38 +106,40 @@ Database Preprocessor::preprocess() { const auto partition_descriptor = preprocessing_db.getPartitionDescriptor(); - std::string order_by_clause = database_config.schema.getStrictOrderByClause(); - SPDLOG_INFO("preprocessing - order by clause is {}", order_by_clause); - SPDLOG_INFO("preprocessing - building database"); preprocessing_db.refreshConnection(); return buildDatabase( - partition_descriptor, order_by_clause, preprocessing_config.getIntermediateResultsDirectory() + partition_descriptor, preprocessing_config.getIntermediateResultsDirectory() ); } void Preprocessor::buildTablesFromNdjsonInput(const std::filesystem::path& file_name) { + (void)preprocessing_db.query(fmt::format( + R"-( + CREATE OR REPLACE TABLE metadata_table({}); + )-", + boost::join(MetadataInfo::getMetadataTypes(database_config), ",") + )); + if (!std::filesystem::exists(file_name)) { throw silo::preprocessing::PreprocessingException( fmt::format("The specified input file {} does not exist.", file_name.string()) ); } + if (std::filesystem::is_empty(file_name)) { - throw silo::preprocessing::PreprocessingException( - fmt::format("The specified input file {} is empty.", file_name.string()) + SPDLOG_WARN( + "The specified input file {} is empty. Ignoring its content.", file_name.string() ); + return; } SPDLOG_DEBUG("build - validating metadata file '{}' with config", file_name.string()); - const auto metadata_info = MetadataInfo::validateFromNdjsonFile(file_name, database_config); + MetadataInfo::validateNdjsonFile(file_name, database_config); (void)preprocessing_db.query(fmt::format( - R"-( - CREATE OR REPLACE TABLE metadata_table AS - SELECT {} - FROM '{}'; - )-", - boost::join(metadata_info.getMetadataSelects(), ","), + "INSERT INTO metadata_table BY NAME (SELECT {} FROM read_json_auto('{}'));", + boost::join(MetadataInfo::getMetadataSelects(database_config), ","), file_name.string() )); @@ -151,14 +165,19 @@ void Preprocessor::buildTablesFromNdjsonInput(const std::filesystem::path& file_ } void Preprocessor::buildMetadataTableFromFile(const std::filesystem::path& metadata_filename) { - const MetadataInfo metadata_info = - MetadataInfo::validateFromMetadataFile(metadata_filename, database_config); + (void)preprocessing_db.query(fmt::format( + R"-( + CREATE OR REPLACE TABLE metadata_table({}); + )-", + boost::join(MetadataInfo::getMetadataTypes(database_config), ",") + )); + + MetadataInfo::validateMetadataFile(metadata_filename, database_config); (void)preprocessing_db.query(fmt::format( - "CREATE OR REPLACE TABLE metadata_table AS\n" - "SELECT {}\n" - "FROM '{}';", - boost::join(metadata_info.getMetadataSelects(), ","), + "INSERT INTO metadata_table BY NAME (SELECT {} FROM read_csv_auto('{}', delim = '\t', " + "header = true));", + boost::join(MetadataInfo::getMetadataFields(database_config), ","), metadata_filename.string() )); } @@ -262,7 +281,7 @@ FROM metadata_table; (void)preprocessing_db.query( "CREATE OR REPLACE TABLE partition_key_to_partition AS\n" - "SELECT 0::bigint AS partition_key, 0::bigint AS partition_id;" + "SELECT ''::VARCHAR AS partition_key, 0::bigint AS partition_id;" ); (void)preprocessing_db.query( @@ -274,114 +293,143 @@ FROM metadata_table; void Preprocessor::createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name ) { - const SequenceInfo sequence_info(reference_genomes_); - sequence_info.validate(preprocessing_db.getConnection(), file_name); - - std::string partition_by_select; - std::string partition_by_where; - if (database_config.schema.partition_by.has_value()) { - partition_by_select = "partition_key_to_partition.partition_id AS partition_id"; - partition_by_where = fmt::format( - "WHERE (metadata.\"{0}\" = partition_key_to_partition.partition_key) OR " - "(metadata.\"{0}\" IS NULL AND " - "partition_key_to_partition.partition_key IS NULL)", - database_config.schema.partition_by.value() - ); - } else { - partition_by_select = "0 AS partition_id"; - partition_by_where = ""; + if (std::filesystem::is_empty(file_name)) { } + SequenceInfo::validateNdjsonFile( + reference_genomes_, preprocessing_db.getConnection(), file_name + ); - createUnalignedPartitionedSequenceFiles(file_name, partition_by_select, partition_by_where); + createUnalignedPartitionedSequenceFiles(file_name); - createAlignedPartitionedSequenceViews( - file_name, sequence_info, partition_by_select, partition_by_where - ); + createAlignedPartitionedSequenceViews(file_name); } -void Preprocessor::createAlignedPartitionedSequenceViews( - const std::filesystem::path& file_name, - const SequenceInfo& sequence_info, - const std::string& partition_by_select, - const std::string& partition_by_where -) { - std::string order_by_select = ", metadata.\"" + database_config.schema.primary_key + "\" AS \"" + - database_config.schema.primary_key + "\""; - std::string order_by_fields = ", \"" + database_config.schema.primary_key + "\""; - if (database_config.schema.date_to_sort_by.has_value()) { - order_by_select += ", metadata.\"" + database_config.schema.date_to_sort_by.value() + - "\" AS \"" + database_config.schema.date_to_sort_by.value() + "\""; - order_by_fields += ", \"" + database_config.schema.date_to_sort_by.value() + "\""; +void Preprocessor::createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name) { + std::string file_reader_sql; + if (std::filesystem::is_empty(file_name)) { + file_reader_sql = fmt::format( + "SELECT ''::VARCHAR AS key, ''::VARCHAR AS partition_key, {}, {}, {}, {}, {}", + boost::join(silo::prepend("''::VARCHAR AS nuc_", nuc_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS aa_", aa_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS nuc_insertions_", nuc_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS aa_insertions_", aa_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS order_by_field_", order_by_fields), ", ") + ); + } else { + file_reader_sql = fmt::format( + "SELECT metadata.\"{}\" AS key, {} AS partition_key, {}, {}, {}, {}, {} FROM " + "read_json_auto('{}')", + database_config.schema.primary_key, + database_config.schema.partition_by.has_value() + ? fmt::format("metadata.\"{}\"", database_config.schema.partition_by.value()) + : "''::VARCHAR", + boost::join( + silo::tie( + "alignedNucleotideSequences.\"", nuc_sequences, "\" AS nuc_", nuc_sequences, "" + ), + ", " + ), + boost::join( + silo::tie("alignedAminoAcidSequences.\"", aa_sequences, "\" AS aa_", aa_sequences, ""), + ", " + ), + boost::join( + silo::tie( + "nucleotideInsertions.\"", nuc_sequences, "\" AS nuc_insertions_", nuc_sequences, "" + ), + ", " + ), + boost::join( + silo::tie( + "aminoAcidInsertions.\"", aa_sequences, "\" AS aa_insertions_", aa_sequences, "" + ), + ", " + ), + boost::join( + silo::tie("metadata.\"", order_by_fields, "\" AS order_by_field_", order_by_fields, ""), + ", " + ), + file_name.string() + ); } (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE TABLE sequence_table AS\n" - "SELECT metadata.\"{}\" AS key, {}, nucleotideInsertions, aminoAcidInsertions, {} {} \n" - "FROM '{}', partition_key_to_partition " - "{};", - database_config.schema.primary_key, - boost::join(sequence_info.getAlignedSequenceSelects(preprocessing_db), ","), - partition_by_select, - order_by_select, - file_name.string(), - partition_by_where + "SELECT key, partition_key_to_partition.partition_id AS partition_id, {}, {}, {}, {} \n" + "FROM ({}) file_reader " + "JOIN partition_key_to_partition " + "ON (file_reader.partition_key = partition_key_to_partition.partition_key);", + boost::join( + SequenceInfo::getAlignedSequenceSelects(reference_genomes_, preprocessing_db), ", " + ), + boost::join(prepend("nuc_insertions_", nuc_sequences), ", "), + boost::join(prepend("aa_insertions_", aa_sequences), ", "), + boost::join(prepend("order_by_field_", order_by_fields), ", "), + file_reader_sql )); (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {} AS\n" - "SELECT key, partition_id, nucleotideInsertions.* {} \n" - "FROM sequence_table", + "SELECT key, partition_id, {}, {} \n" + "FROM sequence_table;", NUC_INSERTION_TABLE_NAME, - order_by_fields + boost::join(silo::tie("nuc_insertions_", nuc_sequences, " AS ", nuc_sequences, " "), ","), + boost::join(prepend("order_by_field_", order_by_fields), ",") )); (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {} AS\n" - "SELECT key, partition_id, aminoAcidInsertions.* {} \n" - "FROM sequence_table", + "SELECT key, partition_id, {}, {} \n" + "FROM sequence_table;", AA_INSERTION_TABLE_NAME, - order_by_fields + boost::join(silo::tie("aa_insertions_", aa_sequences, " AS ", aa_sequences, " "), ","), + boost::join(prepend("order_by_field_", order_by_fields), ",") )); for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW nuc_{0} AS\n" - "SELECT key, nuc_{0} AS sequence, partition_id" - "{1}" + "SELECT key, nuc_{0} AS sequence, partition_id, {1} " "FROM sequence_table;", seq_name, - order_by_fields + boost::join(prepend("order_by_field_", order_by_fields), ",") )); } for (const auto& [seq_name, _] : reference_genomes_.raw_aa_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW gene_{0} AS\n" - "SELECT key, gene_{0} AS sequence, partition_id" - "{1}" + "SELECT key, gene_{0} AS sequence, partition_id, {1} " "FROM sequence_table;", seq_name, - order_by_fields + boost::join(prepend("order_by_field_", order_by_fields), ",") )); } } -void Preprocessor::createUnalignedPartitionedSequenceFiles( - const std::filesystem::path& file_name, - const std::string& partition_by_select, - const std::string& partition_by_where -) { +void Preprocessor::createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name) { for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { + const std::string file_reader_sql = + std::filesystem::is_empty(file_name) + ? fmt::format("SELECT ''::VARCHAR AS key, ''::VARCHAR AS unaligned_nuc_{}", seq_name) + : fmt::format( + "SELECT metadata.\"{0}\" AS key, {1} AS partition_key, " + " unalignedNucleotideSequences.\"{2}\" AS unaligned_nuc_{2} " + "FROM read_json_auto('{3}')", + database_config.schema.primary_key, + database_config.schema.partition_by.has_value() + ? fmt::format("metadata.\"{}\"", database_config.schema.partition_by.value()) + : "''::VARCHAR", + seq_name, + file_name.string() + ); const std::string table_sql = fmt::format( - "SELECT metadata.\"{}\" AS key, {}," - "{} \n" - "FROM '{}', partition_key_to_partition " - "{}", - database_config.schema.primary_key, + "SELECT key, {}, partition_key_to_partition.partition_id \n" + "FROM ({}) file_reader " + "JOIN partition_key_to_partition " + "ON (file_reader.partition_key = partition_key_to_partition.partition_key) ", SequenceInfo::getUnalignedSequenceSelect(seq_name, preprocessing_db), - partition_by_select, - file_name.string(), - partition_by_where + file_reader_sql ); createUnalignedPartitionedSequenceFile(seq_name, table_sql); } @@ -403,12 +451,12 @@ void Preprocessor::createUnalignedPartitionedSequenceFile( } void Preprocessor::createInsertionsTableFromFile( - const std::map& expected_sequences, + const std::vector& expected_sequences, const std::filesystem::path& insertion_file, const std::string& table_name ) { std::set expected_sequence_columns; - for (const auto& [sequence_name, _] : expected_sequences) { + for (const auto& sequence_name : expected_sequences) { expected_sequence_columns.emplace(sequence_name); } @@ -442,14 +490,25 @@ void Preprocessor::createInsertionsTableFromFile( } (void)preprocessing_db.query(fmt::format( - "CREATE TABLE {0} as " - "(SELECT ins.\"{1}\" as key, * " - "FROM read_csv('{2}', delim = '\t', header = true," - " columns = {{{3}}}) ins, " + "CREATE OR REPLACE TABLE {0} AS " + "(SELECT ins.\"{1}\" AS key, {2}, {3}, partition_id " + "FROM read_csv('{4}', delim = '\t', header = true," + " columns = {{{5}}}) ins, " "partitioned_metadata " "WHERE ins.\"{1}\" == partitioned_metadata.\"{1}\" );", table_name, database_config.schema.primary_key, + boost::join(silo::tie("ins.\"", expected_sequences, "\" AS ", expected_sequences, " "), ","), + boost::join( + silo::tie( + "partitioned_metadata.\"", + order_by_fields, + "\" AS order_by_field_", + order_by_fields, + " " + ), + "," + ), insertion_file.string(), boost::join(column_structs, ",") )); @@ -503,13 +562,6 @@ void Preprocessor::createPartitionedTableForSequence( const std::filesystem::path& filename, const std::string& table_prefix ) { - std::string order_by_select = ", raw.key AS " + database_config.schema.primary_key; - if (database_config.schema.date_to_sort_by.has_value()) { - order_by_select += ", partitioned_metadata." + - database_config.schema.date_to_sort_by.value() + " AS " + - database_config.schema.date_to_sort_by.value(); - } - const std::string raw_table_name = "raw_" + table_prefix + sequence_name; const std::string table_name = table_prefix + sequence_name; @@ -519,13 +571,17 @@ void Preprocessor::createPartitionedTableForSequence( R"-( CREATE OR REPLACE VIEW {} AS SELECT key, sequence, - partitioned_metadata.partition_id AS partition_id - {} + partitioned_metadata.partition_id AS partition_id, {} FROM {} AS raw RIGHT JOIN partitioned_metadata ON raw.key = partitioned_metadata."{}"; )-", table_name, - order_by_select, + boost::join( + silo::tie( + "partitioned_metadata.", order_by_fields, " AS order_by_field_", order_by_fields, " " + ), + "," + ), raw_table_name, database_config.schema.primary_key )); @@ -533,7 +589,6 @@ void Preprocessor::createPartitionedTableForSequence( Database Preprocessor::buildDatabase( const preprocessing::Partitions& partition_descriptor, - const std::string& order_by_clause, const std::filesystem::path& intermediate_results_directory ) { Database database; @@ -559,18 +614,28 @@ Database Preprocessor::buildDatabase( tasks.run([&]() { SPDLOG_INFO("build - building metadata store in parallel"); - buildMetadataStore(database, partition_descriptor, order_by_clause); + buildMetadataStore( + database, partition_descriptor, "ORDER BY " + boost::join(order_by_fields, ",") + ); SPDLOG_INFO("build - finished metadata store"); }); tasks.run([&]() { SPDLOG_INFO("build - building nucleotide sequence stores"); - buildNucleotideSequenceStore(database, partition_descriptor, order_by_clause); + buildNucleotideSequenceStore( + database, + partition_descriptor, + "ORDER BY " + boost::join(prepend("order_by_field_", order_by_fields), ",") + ); SPDLOG_INFO("build - finished nucleotide sequence stores"); SPDLOG_INFO("build - building amino acid sequence stores"); - buildAminoAcidSequenceStore(database, partition_descriptor, order_by_clause); + buildAminoAcidSequenceStore( + database, + partition_descriptor, + "ORDER BY " + boost::join(prepend("order_by_field_", order_by_fields), ",") + ); SPDLOG_INFO("build - finished amino acid sequence stores"); }); diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 67e729f1b..4c262122a 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -13,14 +13,14 @@ namespace { struct Scenario { - std::string input_directory; + std::filesystem::path input_directory; uint expected_sequence_count; std::string query; nlohmann::json expected_query_result; }; std::string printTestName(const ::testing::TestParamInfo& info) { - std::string name = "Dir_" + info.param.input_directory; + std::string name = "Dir_" + info.param.input_directory.string(); std::replace(name.begin(), name.end(), '/', '_'); return name; } @@ -91,8 +91,8 @@ const Scenario TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD = { }; const Scenario EMPTY_INPUT_TSV = { - .input_directory = "testBaseData/exampleDatasetEmpty/", - .expected_sequence_count = 2, + .input_directory = "testBaseData/emptyInputTsv/", + .expected_sequence_count = 0, .query = R"( { "action": { @@ -108,8 +108,8 @@ const Scenario EMPTY_INPUT_TSV = { }; const Scenario EMPTY_INPUT_NDJSON = { - .input_directory = "testBaseData/exampleDatasetEmpty/", - .expected_sequence_count = 2, + .input_directory = "testBaseData/emptyInputNdjson/", + .expected_sequence_count = 0, .query = R"( { "action": { @@ -140,15 +140,15 @@ INSTANTIATE_TEST_SUITE_P( printTestName ); -TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) { +TEST_P(PreprocessorTestFixture, shouldProcessData) { const auto scenario = GetParam(); + silo::config::PreprocessingConfig config{.input_directory = scenario.input_directory}; - silo::config::PreprocessingConfig config; - config.overwrite(silo::config::YamlConfig(scenario.input_directory + "preprocessing_config.yaml") + config.overwrite(silo::config::YamlConfig(scenario.input_directory / "preprocessing_config.yaml") ); const auto database_config = silo::config::ConfigRepository().getValidatedConfig( - scenario.input_directory + "database_config.yaml" + scenario.input_directory / "database_config.yaml" ); const auto reference_genomes = @@ -163,7 +163,9 @@ TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) { const auto database_info = database.getDatabaseInfo(); - EXPECT_GT(database_info.total_size, 0UL); + if (scenario.expected_sequence_count > 0) { + EXPECT_GT(database_info.total_size, 0UL); + } EXPECT_EQ(database_info.sequence_count, scenario.expected_sequence_count); const silo::query_engine::QueryEngine query_engine(database); diff --git a/src/silo/preprocessing/sequence_info.cpp b/src/silo/preprocessing/sequence_info.cpp index 8200305a7..2620e45d0 100644 --- a/src/silo/preprocessing/sequence_info.cpp +++ b/src/silo/preprocessing/sequence_info.cpp @@ -9,24 +9,18 @@ namespace silo::preprocessing { -SequenceInfo::SequenceInfo(const silo::ReferenceGenomes& reference_genomes) { - for (const auto& [name, sequence] : reference_genomes.raw_nucleotide_sequences) { - nuc_sequence_names.push_back(name); - } - for (const auto& [name, sequence] : reference_genomes.raw_aa_sequences) { - aa_sequence_names.push_back(name); - } -} - std::vector SequenceInfo::getAlignedSequenceSelects( + const silo::ReferenceGenomes& reference_genomes, const PreprocessingDatabase& preprocessing_db -) const { +) { std::vector sequence_selects; - sequence_selects.reserve(nuc_sequence_names.size() + aa_sequence_names.size()); - for (const std::string& name : nuc_sequence_names) { + sequence_selects.reserve( + reference_genomes.nucleotide_sequences.size() + reference_genomes.aa_sequences.size() + ); + for (const auto& [name, _] : reference_genomes.nucleotide_sequences) { sequence_selects.emplace_back(getNucleotideSequenceSelect(name, preprocessing_db)); } - for (const std::string& name : aa_sequence_names) { + for (const auto& [name, _] : reference_genomes.aa_sequences) { sequence_selects.emplace_back(getAminoAcidSequenceSelect(name, preprocessing_db)); } return sequence_selects; @@ -36,8 +30,7 @@ std::string SequenceInfo::getNucleotideSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("alignedNucleotideSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("nuc_{}", seq_name); return fmt::format( "{0} AS nuc_{1}", @@ -52,8 +45,7 @@ std::string SequenceInfo::getUnalignedSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("unalignedNucleotideSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("unaligned_nuc_{}", seq_name); return fmt::format( "{0} AS unaligned_nuc_{1}", preprocessing_db.compress_nucleotide_functions.at(seq_name)->generateSqlStatement( @@ -67,8 +59,7 @@ std::string SequenceInfo::getAminoAcidSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("alignedAminoAcidSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("aa_{}", seq_name); return fmt::format( "{0} AS gene_{1}", @@ -110,10 +101,29 @@ void validateStruct( } } -void SequenceInfo::validate( +void SequenceInfo::validateNdjsonFile( + const silo::ReferenceGenomes& reference_genomes, duckdb::Connection& connection, const std::filesystem::path& input_filename -) const { +) { + if (std::filesystem::is_empty(input_filename)) { + return; + } + + std::vector nuc_sequence_names; + std::transform( + reference_genomes.raw_nucleotide_sequences.begin(), + reference_genomes.raw_nucleotide_sequences.end(), + std::back_inserter(nuc_sequence_names), + [](auto& pair) { return pair.first; } + ); + std::vector aa_sequence_names; + std::transform( + reference_genomes.raw_aa_sequences.begin(), + reference_genomes.raw_aa_sequences.end(), + std::back_inserter(aa_sequence_names), + [](auto& pair) { return pair.first; } + ); auto result = connection.Query(fmt::format( "SELECT json_keys(alignedNucleotideSequences), json_keys(alignedAminoAcidSequences), " "json_keys(unalignedNucleotideSequences), json_keys(nucleotideInsertions), " diff --git a/src/silo/preprocessing/sequence_info.test.cpp b/src/silo/preprocessing/sequence_info.test.cpp index 54874842b..a484d56f1 100644 --- a/src/silo/preprocessing/sequence_info.test.cpp +++ b/src/silo/preprocessing/sequence_info.test.cpp @@ -13,25 +13,23 @@ TEST(SequenceInfo, validatesSuccessfulOnCorrectFile) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); - ASSERT_NO_THROW(sequence_info.validate( - connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" + ASSERT_NO_THROW(SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" )); } TEST(SequenceInfo, failWhenTooManyGenomesInReferences) { const auto reference_genomes = ReferenceGenomes::readFromFile("testBaseData/exampleDataset/reference_genomes.json"); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate( - connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" ), silo::preprocessing::PreprocessingException ); @@ -41,12 +39,13 @@ TEST(SequenceInfo, failWhenTooManyGenomesInJson) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate(connection, "testBaseData/ndjsonFiles/oneline_second_nuc.json.zst"), + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/ndjsonFiles/oneline_second_nuc.json.zst" + ), silo::preprocessing::PreprocessingException ); } @@ -55,12 +54,13 @@ TEST(SequenceInfo, failWhenTooFewAASequencesInJson) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate(connection, "testBaseData/ndjsonFiles/oneline_without_ORF.json.zst"), + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/ndjsonFiles/oneline_without_ORF.json.zst" + ), silo::preprocessing::PreprocessingException ); } \ No newline at end of file diff --git a/testBaseData/emptyInputNdjson/aa_insertions.tsv b/testBaseData/emptyInputNdjson/aa_insertions.tsv new file mode 100644 index 000000000..f1ef77a70 --- /dev/null +++ b/testBaseData/emptyInputNdjson/aa_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl S ORF1a E M N ORF1b ORF3a ORF6 ORF7a ORF7b ORF8 ORF9b diff --git a/testBaseData/emptyInputNdjson/database_config.yaml b/testBaseData/emptyInputNdjson/database_config.yaml index 1da270b08..a3e0b45dd 100644 --- a/testBaseData/emptyInputNdjson/database_config.yaml +++ b/testBaseData/emptyInputNdjson/database_config.yaml @@ -22,10 +22,6 @@ schema: type: int - name: qc_value type: float - - name: nucleotideInsertions - type: insertion - - name: aminoAcidInsertions - type: aaInsertion primaryKey: gisaid_epi_isl dateToSortBy: date partitionBy: pango_lineage \ No newline at end of file diff --git a/testBaseData/emptyInputNdjson/nuc_insertions.tsv b/testBaseData/emptyInputNdjson/nuc_insertions.tsv new file mode 100644 index 000000000..59063c0ea --- /dev/null +++ b/testBaseData/emptyInputNdjson/nuc_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl main testSecondSequence diff --git a/testBaseData/emptyInputTsv/aa_insertions.tsv b/testBaseData/emptyInputTsv/aa_insertions.tsv new file mode 100644 index 000000000..f1ef77a70 --- /dev/null +++ b/testBaseData/emptyInputTsv/aa_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl S ORF1a E M N ORF1b ORF3a ORF6 ORF7a ORF7b ORF8 ORF9b diff --git a/testBaseData/emptyInputTsv/database_config.yaml b/testBaseData/emptyInputTsv/database_config.yaml index 1da270b08..a3e0b45dd 100644 --- a/testBaseData/emptyInputTsv/database_config.yaml +++ b/testBaseData/emptyInputTsv/database_config.yaml @@ -22,10 +22,6 @@ schema: type: int - name: qc_value type: float - - name: nucleotideInsertions - type: insertion - - name: aminoAcidInsertions - type: aaInsertion primaryKey: gisaid_epi_isl dateToSortBy: date partitionBy: pango_lineage \ No newline at end of file diff --git a/testBaseData/emptyInputTsv/nuc_insertions.tsv b/testBaseData/emptyInputTsv/nuc_insertions.tsv new file mode 100644 index 000000000..59063c0ea --- /dev/null +++ b/testBaseData/emptyInputTsv/nuc_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl main testSecondSequence diff --git a/testBaseData/emptyInputTsv/small_metadata_set.tsv b/testBaseData/emptyInputTsv/small_metadata_set.tsv index d55907bf5..da32055fa 100644 --- a/testBaseData/emptyInputTsv/small_metadata_set.tsv +++ b/testBaseData/emptyInputTsv/small_metadata_set.tsv @@ -1,101 +1 @@ gisaid_epi_isl pango_lineage date region country division unsorted_date age qc_value nucleotideInsertions aminoAcidInsertions -EPI_ISL_1408408 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 S:214:EPE -EPI_ISL_1749899 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 -EPI_ISL_2016901 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 -EPI_ISL_1749892 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 -EPI_ISL_1597932 B.1.1.7 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 S:214:EPE -EPI_ISL_1407962 B.1.1.7 Europe Switzerland Solothurn 2021-01-16 55 0.93 -EPI_ISL_1750503 B.1.258.17 2020-12-24 Europe Switzerland Zürich 2021-02-14 56 0.92 -EPI_ISL_1360935 B.1.1.7 2021-03-08 Europe Switzerland Jura 2021-01-03 57 0.91 -EPI_ISL_2019235 B.1.1.7 2021-04-28 Europe Switzerland Basel-Stadt 2021-01-22 58 0.90 -EPI_ISL_1749960 B.1.1.7 2021-04-15 Europe Switzerland Basel-Land 2021-02-03 59 0.89 -EPI_ISL_1361468 B.1.1.7 2021-03-06 Europe Switzerland Zürich 2021-01-20 50 0.98 -EPI_ISL_1408062 B.1.1.7 2021-03-03 Europe Switzerland Valais 2020-11-24 50 0.97 22204:CAGAA -EPI_ISL_1597890 B.1.1.7 2021-03-21 Switzerland Vaud 2021-01-25 51 0.96 22339:GCTGGT -EPI_ISL_1682849 XA.1 2020-12-17 Europe Switzerland Thurgau 2021-01-21 52 0.95 -EPI_ISL_1408805 B.1.221 2020-11-24 Europe Switzerland Schwyz 2020-12-09 53 0.94 -EPI_ISL_1750868 B.1.1.189 2020-12-15 Europe Switzerland Solothurn 2021-01-20 54 0.93 S:214:EPE -EPI_ISL_2019350 B.1.1.7 2021-04-27 Europe Switzerland Valais 2020-12-21 55 0.92 -EPI_ISL_2017036 B.1.1.7 2021-04-23 Europe Switzerland Solothurn 2021-03-09 56 0.91 -EPI_ISL_1599113 B.1.1.39 2020-12-08 Europe Switzerland Zürich 2021-03-05 57 0.90 -EPI_ISL_2214128 B.1.1.7 2021-05-10 Europe Switzerland Geneva 2020-11-13 58 0.89 -EPI_ISL_2408472 B.1.1.7 2021-05-25 Europe Switzerland Obwalden 2021-03-02 59 0.98 -EPI_ISL_830864 B.1.177 2020-10-08 Europe Switzerland Basel-Stadt 2021-03-03 50 0.97 -EPI_ISL_581968 B.1.160 2020-08-17 Europe Switzerland Basel-Stadt 2021-03-25 50 0.96 S:214:EPE -EPI_ISL_2213804 Q.7 2021-05-08 Europe Switzerland Geneva 2021-04-12 51 25701:CCC -EPI_ISL_2405276 B.1.1.7 2021-05-24 Europe Switzerland Vaud 2021-04-28 52 0.94 -EPI_ISL_2213934 B.1.1.7 2021-05-13 Europe Switzerland Geneva 2021-04-23 53 0.93 -EPI_ISL_2213984 B.1.1.7 2021-05-08 Europe Switzerland Geneva 2021-05-09 54 0.92 25701:CCC -EPI_ISL_2574088 B.1.1.7 2021-06-10 Europe Switzerland Sankt Gallen 2021-05-05 55 0.91 25701:CCC -EPI_ISL_2544226 B.1.1.7 2021-06-05 Europe Switzerland Ticino 2021-05-12 56 0.90 -EPI_ISL_2360326 Q.7 2021-05-23 Europe Switzerland Ticino 2021-03-10 57 0.89 -EPI_ISL_2379651 B.1.1.7 2021-05-11 Europe Switzerland Valais 2021-06-01 58 0.98 -EPI_ISL_1036103 B.1.258 2020-12-09 Europe Switzerland Aargau 2021-06-03 59 0.97 -EPI_ISL_931279 B.1.1 2020-10-28 Europe Switzerland Basel-Stadt 2021-05-11 50 0.96 -EPI_ISL_931031 B.1.177 2020-10-22 Europe Switzerland Basel-Stadt 2021-05-10 50 0.95 -EPI_ISL_1273458 B.1.1.7 2021-01-26 Europe Switzerland Basel-Land 2021-05-18 51 0.94 25701:CCC -EPI_ISL_1273715 B.1.160 2021-01-20 Europe Switzerland Basel-Stadt 2021-05-08 52 0.93 -EPI_ISL_737604 B.1.1 2020-12-14 Europe Switzerland Bern 2021-05-14 53 0.92 -EPI_ISL_1129663 B.1.1.7 2020-12-29 Europe Switzerland Bern 2021-05-07 54 0.91 -EPI_ISL_1003629 B.1.1.39 2021-01-25 Europe Switzerland Aargau 2021-05-18 55 0.90 S:214:EPE -EPI_ISL_737715 B.1.177 2020-12-13 Europe Switzerland Bern 2021-05-16 56 0.89 S:247:SGE -EPI_ISL_1003036 B.1.177 2021-01-16 Europe Switzerland Aargau 2021-07-14 57 0.98 5959:TAT -EPI_ISL_899762 B.1.177 2020-12-25 Europe Switzerland Schwyz 2021-07-19 58 0.97 -EPI_ISL_899725 B.1.177 2021-01-12 Europe Switzerland Solothurn 2021-07-14 59 0.96 S:210:IV -EPI_ISL_1195052 B.1.1.7 2021-02-23 Europe Switzerland Solothurn 2021-07-04 50 0.95 -EPI_ISL_1003519 B.1.160.16 2021-01-22 Europe Switzerland 2021-07-29 50 0.94 -EPI_ISL_1003010 B.1.36.35 2021-01-15 Europe Switzerland Solothurn 2021-07-19 51 0.93 -EPI_ISL_1119584 B.1.1 2020-11-04 Europe Switzerland Solothurn 2021-07-05 52 0.92 -EPI_ISL_1002052 B.1 2021-01-15 Europe Switzerland Solothurn 2021-07-15 53 0.91 -EPI_ISL_466942 B.1 2020-03-08 Europe Switzerland Basel-Stadt 2021-05-12 54 0.90 -EPI_ISL_1003849 B.1.160 2021-01-29 Europe Switzerland Neuchâtel 2021-08-05 55 0.89 -EPI_ISL_768148 GD.1 2020-12-24 Europe Switzerland Sankt Gallen 2020-03-16 56 0.98 25701:CCC -EPI_ISL_1080536 B.1.1.7 2021-02-10 Europe Switzerland Basel-Land 2021-08-04 57 0.97 -EPI_ISL_1002156 B.1.221 2021-01-16 Europe Switzerland Basel-Land 2021-02-03 58 0.96 -EPI_ISL_1119315 B.1.1.7 2021-02-14 Europe Switzerland Graubünden 2021-03-18 59 0.95 -EPI_ISL_1004495 B.1.177.44 2021-01-03 Europe Switzerland 2021-04-13 50 0.94 25701:CCC -EPI_ISL_1001920 B.1.177 2021-01-22 Europe Switzerland Bern 2021-04-25 50 0.93 -EPI_ISL_1131102 B.1.160 2021-02-03 Europe Switzerland Zürich 2021-04-13 51 0.92 -EPI_ISL_1003373 B.1.177 2021-01-20 Europe Switzerland Zürich 2021-03-19 52 0.91 -EPI_ISL_721941 B.1.1.70 2020-11-24 Europe Switzerland Zürich 2021-03-15 53 0.90 -EPI_ISL_1130868 B.1.525 2021-01-25 Europe Switzerland Zürich 2020-12-24 54 0.89 25701:CCC -EPI_ISL_1003425 B.1.177 2021-01-21 Europe Switzerland Uri 2021-03-08 55 0.98 -EPI_ISL_737860 B.1.160 2020-12-09 Europe Switzerland Valais 2021-04-28 56 0.97 -EPI_ISL_1001493 B.1.177.44 2021-01-20 Europe Switzerland Vaud 2021-04-15 57 0.96 -EPI_ISL_1260480 B.1.160 2020-12-21 Europe Switzerland Zürich 2021-03-06 58 0.95 -EPI_ISL_1747885 B.1.1.7 2021-03-09 Europe Switzerland Solothurn 2021-03-03 59 0.94 -EPI_ISL_1747752 B.1.1.7 2021-03-05 Europe Switzerland Basel-Land 2021-03-21 50 0.93 -EPI_ISL_1005148 B.1.221 2020-11-13 Europe Switzerland Solothurn 2020-12-17 50 0.92 25701:CCC -EPI_ISL_1748243 B.1.1.7 2021-03-02 Europe Switzerland Solothurn 2020-11-24 0.91 -EPI_ISL_1748215 B.1.1.7 2021-03-03 Europe Switzerland Solothurn 2020-12-15 52 0.90 -EPI_ISL_1748395 B.1.1.7 2021-03-25 Europe Switzerland Basel-Stadt 2021-04-27 53 0.89 -EPI_ISL_1760534 B.1.1.7 2021-04-12 Europe Switzerland Ticino 2021-04-23 54 0.98 -EPI_ISL_2086867 C.36.3 2021-04-28 Europe Switzerland Zürich 2020-12-08 55 0.97 25701:CCC -EPI_ISL_1840634 Q.7 2021-04-23 Europe Switzerland Ticino 2021-05-10 56 0.96 -EPI_ISL_2180995 B.1.1.7 2021-05-09 Europe Switzerland Basel-Stadt 2021-05-25 57 0.95 -EPI_ISL_2181005 B.1.1.7 2021-05-05 Europe Switzerland Basel-Stadt 2020-10-08 58 0.94 -EPI_ISL_2180023 B.1.1.7 2021-05-12 Europe Switzerland Ticino 2020-08-17 59 0.93 25701:CCC -EPI_ISL_2270139 B.1.1.7 2021-03-10 Europe Switzerland Basel-Stadt 2021-05-08 50 0.92 -EPI_ISL_2544452 B.1.1.7 2021-06-01 Europe Switzerland Schwyz 2021-05-24 50 0.91 -EPI_ISL_2544332 B.1.1.7 2021-06-03 Europe Switzerland Bern 2021-05-13 51 0.90 25701:CCC -EPI_ISL_2307766 B.1.1.7 2021-05-11 Europe Switzerland Bern 2021-05-08 52 0.89 -EPI_ISL_2375490 B.1.1.7 2021-05-10 Europe Switzerland Valais 2021-06-10 53 0.98 -EPI_ISL_2374969 B.1.1.7 2021-05-18 Europe Switzerland Aargau 2021-06-05 54 0.97 25701:CCC -EPI_ISL_2307888 B.1.1.7 2021-05-08 Europe Switzerland Solothurn 2021-05-23 55 0.96 -EPI_ISL_2375247 B.1.1.7 2021-05-14 Europe Switzerland Sankt Gallen 2021-05-11 56 25701:CCC -EPI_ISL_2308054 B.1.1.7 2021-05-07 Europe Switzerland Zürich 2020-12-09 57 0.94 -EPI_ISL_2375165 B.1.1.7 2021-05-18 Europe Switzerland Basel-Land 2020-10-28 58 0.93 -EPI_ISL_2375097 B.1.1.7 2021-05-16 Europe Switzerland Basel-Land 2020-10-22 59 0.92 -EPI_ISL_3128737 AY.9.2 2021-07-14 Europe Switzerland Zürich 2021-01-26 50 0.91 -EPI_ISL_3128811 B.1.617.2 2021-07-19 Europe Switzerland Aargau 2021-01-20 50 0.90 -EPI_ISL_3086369 AY.122 2021-07-14 Europe Switzerland Ticino 2020-12-14 51 0.89 25701:CCC -EPI_ISL_3259931 AY.43 2021-07-04 Europe Switzerland Vaud 2020-12-29 52 0.98 S:143:T -EPI_ISL_3267832 AY.43 2021-07-29 Europe Switzerland Bern 2021-01-25 53 0.97 -EPI_ISL_3128796 B.1.617.2 2021-07-19 Europe Switzerland Zürich 2020-12-13 54 0.96 25701:CCC -EPI_ISL_3016465 B.1.1.7 2021-07-05 Europe Switzerland Valais 2021-01-16 0.95 -EPI_ISL_3247294 2021-07-15 Europe Switzerland Basel-Stadt 2020-12-25 56 0.94 -EPI_ISL_3578231 P.1 2021-05-12 Europe Switzerland Zürich 2021-01-12 57 0.93 25701:CCC -EPI_ISL_3465732 AY.43 2021-08-05 Europe Switzerland Vaud 2021-02-23 58 0.92 -EPI_ISL_2367431 B.1 2020-03-16 Europe Switzerland Vaud 2021-01-22 59 0.91 -EPI_ISL_3465556 AY.43 2021-08-04 Europe Switzerland Solothurn 2021-01-15 50 0.90 -EPI_ISL_2359636 B.1.1.189 2021-02-03 Europe Switzerland Vaud 2020-11-04 57 0.89 25701:CCC ORF1a:3602:F diff --git a/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst b/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst index 6df5de35c..f1a3f8aed 100644 Binary files a/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst and b/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst differ