From ad268d5a5293cb788d1a04950b37030af6a6d65c Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Wed, 15 May 2024 16:38:18 +0200 Subject: [PATCH] fix: start with empty files without throwing an error --- include/silo/common/string_utils.h | 13 + include/silo/config/database_config.h | 11 +- include/silo/preprocessing/metadata_info.h | 25 +- include/silo/preprocessing/preprocessor.h | 19 +- include/silo/preprocessing/sequence_info.h | 22 +- src/silo/common/string_utils.cpp | 34 +++ src/silo/config/database_config.cpp | 17 +- src/silo/preprocessing/metadata_info.cpp | 209 +++++++------ src/silo/preprocessing/metadata_info.test.cpp | 12 +- src/silo/preprocessing/preprocessor.cpp | 285 +++++++++++------- src/silo/preprocessing/preprocessor.test.cpp | 24 +- src/silo/preprocessing/sequence_info.cpp | 52 ++-- src/silo/preprocessing/sequence_info.test.cpp | 20 +- .../emptyInputNdjson/aa_insertions.tsv | 1 + .../emptyInputNdjson/database_config.yaml | 4 - .../emptyInputNdjson/nuc_insertions.tsv | 1 + testBaseData/emptyInputTsv/aa_insertions.tsv | 1 + .../emptyInputTsv/database_config.yaml | 4 - testBaseData/emptyInputTsv/nuc_insertions.tsv | 1 + .../emptyInputTsv/small_metadata_set.tsv | 100 ------ .../ndjsonFiles/oneline_second_nuc.json.zst | Bin 10083 -> 10080 bytes 21 files changed, 452 insertions(+), 403 deletions(-) create mode 100644 testBaseData/emptyInputNdjson/aa_insertions.tsv create mode 100644 testBaseData/emptyInputNdjson/nuc_insertions.tsv create mode 100644 testBaseData/emptyInputTsv/aa_insertions.tsv create mode 100644 testBaseData/emptyInputTsv/nuc_insertions.tsv diff --git a/include/silo/common/string_utils.h b/include/silo/common/string_utils.h index 7bbc89354..28ff26410 100644 --- a/include/silo/common/string_utils.h +++ b/include/silo/common/string_utils.h @@ -12,4 +12,17 @@ std::string removeSymbol(const std::string& value, char symbol); std::vector slice(const std::vector& elements, size_t start, size_t end); +std::vector prepend( + const std::string& prefix, + const std::vector& elements +); + +std::vector tie( + const std::string& prefix, + const std::vector& elements1, + const std::string& delimiter, + const std::vector& elements2, + const std::string& suffix +); + } // namespace silo diff --git a/include/silo/config/database_config.h b/include/silo/config/database_config.h index ec995b46d..ecf043ab5 100644 --- a/include/silo/config/database_config.h +++ b/include/silo/config/database_config.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -31,8 +32,6 @@ class DatabaseSchema { std::string primary_key; std::optional date_to_sort_by; std::optional partition_by; - - [[nodiscard]] std::string getStrictOrderByClause() const; }; class DatabaseConfig { @@ -52,6 +51,14 @@ class DatabaseConfigReader { } // namespace silo::config +template <> +struct std::less { + bool operator()( + const silo::config::DatabaseMetadata& lhs, + const silo::config::DatabaseMetadata& rhs + ) const; +}; + template <> struct [[maybe_unused]] fmt::formatter : fmt::formatter { [[maybe_unused]] static auto format( diff --git a/include/silo/preprocessing/metadata_info.h b/include/silo/preprocessing/metadata_info.h index e6ce45ce4..4e23dac23 100644 --- a/include/silo/preprocessing/metadata_info.h +++ b/include/silo/preprocessing/metadata_info.h @@ -2,34 +2,37 @@ #include #include -#include #include #include "silo/config/database_config.h" namespace silo::preprocessing { -class PreprocessingDatabase; - class MetadataInfo { - std::unordered_map metadata_selects; - - MetadataInfo(std::unordered_map metadata_selects); - public: - static MetadataInfo validateFromMetadataFile( + static void validateMetadataFile( const std::filesystem::path& metadata_file, const silo::config::DatabaseConfig& database_config ); - static MetadataInfo validateFromNdjsonFile( + static void validateNdjsonFile( const std::filesystem::path& ndjson_file, const silo::config::DatabaseConfig& database_config ); - std::vector getMetadataFields() const; + static std::vector getMetadataFields( + const silo::config::DatabaseConfig& database_config + ); - std::vector getMetadataSelects() const; + static std::vector getMetadataTypes( + const silo::config::DatabaseConfig& database_config + ); + + static std::string getMetadataStruct(const silo::config::DatabaseConfig& database_config); + + static std::vector getMetadataSelects( + const silo::config::DatabaseConfig& database_config + ); }; } // namespace silo::preprocessing diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index 6491963d9..416173ce8 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -14,6 +14,9 @@ namespace preprocessing { class SequenceInfo; class Preprocessor { + std::vector nuc_sequences; + std::vector aa_sequences; + std::vector order_by_fields; config::PreprocessingConfig preprocessing_config; config::DatabaseConfig database_config; PreprocessingDatabase preprocessing_db; @@ -39,24 +42,15 @@ class Preprocessor { void buildEmptyPartitioning(); void createInsertionsTableFromFile( - const std::map& expected_sequences, + const std::vector& expected_sequences, const std::filesystem::path& insertion_file, const std::string& table_name ); void createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name); - void createAlignedPartitionedSequenceViews( - const std::filesystem::path& file_name, - const SequenceInfo& sequence_info, - const std::string& partition_by_select, - const std::string& partition_by_where - ); - void createUnalignedPartitionedSequenceFiles( - const std::filesystem::path& file_name, - const std::string& partition_by_select, - const std::string& partition_by_where - ); + void createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name); + void createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name); void createUnalignedPartitionedSequenceFile( const std::string& seq_name, const std::string& table_sql @@ -72,7 +66,6 @@ class Preprocessor { Database buildDatabase( const preprocessing::Partitions& partition_descriptor, - const std::string& order_by_clause, const std::filesystem::path& intermediate_results_directory ); diff --git a/include/silo/preprocessing/sequence_info.h b/include/silo/preprocessing/sequence_info.h index 82a6f68e0..3fb4861f6 100644 --- a/include/silo/preprocessing/sequence_info.h +++ b/include/silo/preprocessing/sequence_info.h @@ -19,32 +19,32 @@ namespace preprocessing { class PreprocessingDatabase; class SequenceInfo { - std::vector nuc_sequence_names; - std::vector aa_sequence_names; - public: - explicit SequenceInfo(const silo::ReferenceGenomes& reference_genomes); - - [[nodiscard]] std::vector getAlignedSequenceSelects( + [[nodiscard]] static std::vector getAlignedSequenceSelects( + const silo::ReferenceGenomes& reference_genomes, const PreprocessingDatabase& preprocessing_db - ) const; + ); - static std::string getNucleotideSequenceSelect( + [[nodiscard]] static std::string getNucleotideSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - static std::string getUnalignedSequenceSelect( + [[nodiscard]] static std::string getUnalignedSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - static std::string getAminoAcidSequenceSelect( + [[nodiscard]] static std::string getAminoAcidSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ); - void validate(duckdb::Connection& connection, const std::filesystem::path& input_filename) const; + static void validateNdjsonFile( + const silo::ReferenceGenomes& reference_genomes, + duckdb::Connection& connection, + const std::filesystem::path& input_filename + ); }; } // namespace preprocessing } // namespace silo \ No newline at end of file diff --git a/src/silo/common/string_utils.cpp b/src/silo/common/string_utils.cpp index 6df995d74..2f377a044 100644 --- a/src/silo/common/string_utils.cpp +++ b/src/silo/common/string_utils.cpp @@ -1,8 +1,11 @@ #include "silo/common/string_utils.h" #include +#include #include +#include + namespace silo { std::vector splitBy(const std::string& value, const std::string_view delimiter) { @@ -40,4 +43,35 @@ std::vector slice(const std::vector& elements, size_t } return sliced_elements; } + +std::vector prepend( + const std::string& prefix, + const std::vector& elements +) { + std::vector output; + output.reserve(elements.size()); + for (const std::string& str : elements) { + output.emplace_back(prefix + str); + } + return output; +} + +std::vector tie( + const std::string& prefix, + const std::vector& elements1, + const std::string& delimiter, + const std::vector& elements2, + const std::string& suffix +) { + assert(elements1.size() == elements2.size()); + std::vector output; + output.reserve(elements1.size()); + for (size_t i = 0; i < elements1.size(); ++i) { + output.emplace_back( + fmt::format("{}{}{}{}{}", prefix, elements1[i], delimiter, elements2[i], suffix) + ); + } + return output; +} + } // namespace silo \ No newline at end of file diff --git a/src/silo/config/database_config.cpp b/src/silo/config/database_config.cpp index cabb859be..48e058a02 100644 --- a/src/silo/config/database_config.cpp +++ b/src/silo/config/database_config.cpp @@ -57,6 +57,13 @@ std::string toString(ValueType type) { } } // namespace +bool std::less::operator()( + const silo::config::DatabaseMetadata& lhs, + const silo::config::DatabaseMetadata& rhs +) const { + return lhs.name < rhs.name; +} + namespace YAML { template <> struct convert { @@ -179,16 +186,6 @@ ColumnType DatabaseMetadata::getColumnType() const { throw std::runtime_error("Did not find metadata with name: " + std::string(name)); } -std::string DatabaseSchema::getStrictOrderByClause() const { - if (date_to_sort_by.has_value()) { - SPDLOG_INFO("preprocessing - produce order by clause with a date to sort by"); - return fmt::format("ORDER BY {}, {}", date_to_sort_by.value(), primary_key); - } - - SPDLOG_INFO("preprocessing - produce order by clause without a date to sort by"); - return fmt::format("ORDER BY {}", primary_key); -} - std::optional DatabaseConfig::getMetadata(const std::string& name) const { auto element = std::find_if( std::begin(schema.metadata), diff --git a/src/silo/preprocessing/metadata_info.cpp b/src/silo/preprocessing/metadata_info.cpp index 9be40c6ac..56e12a996 100644 --- a/src/silo/preprocessing/metadata_info.cpp +++ b/src/silo/preprocessing/metadata_info.cpp @@ -9,83 +9,82 @@ #include "silo/preprocessing/preprocessing_exception.h" namespace { - -std::unordered_map validateFieldsAgainstConfig( - const std::unordered_map& found_metadata_fields, - const silo::config::DatabaseConfig& database_config -) { - std::vector config_metadata_fields; - std::transform( - database_config.schema.metadata.begin(), - database_config.schema.metadata.end(), - std::back_inserter(config_metadata_fields), - [](auto metadata) { return metadata.name; } - ); - - std::unordered_map validated_metadata_fields; - for (const auto& [field_name, access_path] : found_metadata_fields) { - if (std::find(config_metadata_fields.begin(), config_metadata_fields.end(), field_name) != - config_metadata_fields.end()) { - validated_metadata_fields.emplace(field_name, access_path); - } else { - SPDLOG_WARN( - "Metadata field {} ({}), which is contained in the file is not contained in the " - "config.", - field_name, - access_path - ); - } +using silo::config::ValueType; + +std::string toSQLType(ValueType value_type) { + switch (value_type) { + case ValueType::INT: + return "INT4"; + case ValueType::STRING: + case ValueType::PANGOLINEAGE: + return "VARCHAR"; + case ValueType::FLOAT: + return "FLOAT4"; + case ValueType::BOOL: + return "BOOL"; + case ValueType::DATE: + return "DATE"; } - for (const std::string& name : config_metadata_fields) { - if (!validated_metadata_fields.contains(name)) { - throw silo::preprocessing::PreprocessingException(fmt::format( - "The metadata field '{}' which is contained in the database config is " - "not contained in the input.", - name - )); - } - } - - std::string metadata_field_string; - for (const auto& [field_name, select] : validated_metadata_fields) { - metadata_field_string += "'"; - metadata_field_string += field_name; - metadata_field_string += "' with selection '"; - metadata_field_string += select; - metadata_field_string += "',"; - } - SPDLOG_TRACE("Found metadata fields: " + metadata_field_string); - return validated_metadata_fields; } } // namespace namespace silo::preprocessing { -MetadataInfo::MetadataInfo(std::unordered_map metadata_selects) - : metadata_selects(std::move(metadata_selects)) {} - -MetadataInfo MetadataInfo::validateFromMetadataFile( +void MetadataInfo::validateMetadataFile( const std::filesystem::path& metadata_file, const silo::config::DatabaseConfig& database_config ) { duckdb::DuckDB duck_db(nullptr); duckdb::Connection connection(duck_db); // Get the column names (headers) of the table - auto result = - connection.Query(fmt::format("SELECT * FROM '{}' LIMIT 0", metadata_file.string())); + auto result = connection.Query(fmt::format( + "SELECT * FROM read_csv_auto('{}', delim = '\t', header = true) LIMIT 0", + metadata_file.string() + )); - std::unordered_map file_metadata_fields; + if (result->HasError()) { + const std::string error_message = fmt::format( + "Preprocessing exception when retrieving the fields of the " + "metadata file '{}', " + "duckdb threw with error: {}", + metadata_file.string(), + result->GetError() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + + std::set actual_fields; for (size_t idx = 0; idx < result->ColumnCount(); idx++) { - file_metadata_fields[result->ColumnName(idx)] = "\"" + result->ColumnName(idx) + "\""; + actual_fields.emplace(result->ColumnName(idx)); + if (std::find_if(database_config.schema.metadata.begin(), database_config.schema.metadata.end(), [&](const auto& metadata) { + return metadata.name == result->ColumnName(idx); + }) == database_config.schema.metadata.end()) { + SPDLOG_WARN( + "The field '{}' which is contained in the metadata file '{}' is not contained in the " + "database config.", + result->ColumnName(idx), + metadata_file.string() + ); + } } - const std::unordered_map validated_metadata_fields = - validateFieldsAgainstConfig(file_metadata_fields, database_config); - return {validated_metadata_fields}; + for (const auto& field : database_config.schema.metadata) { + if (!actual_fields.contains(field.name)) { + const std::string error_message = fmt::format( + "The field '{}' which is contained in the database config is not contained in the " + "input field '{}'.", + field.name, + metadata_file.string() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + } } -MetadataInfo MetadataInfo::validateFromNdjsonFile( +void MetadataInfo::validateNdjsonFile( const std::filesystem::path& ndjson_file, const silo::config::DatabaseConfig& database_config ) { @@ -93,53 +92,89 @@ MetadataInfo MetadataInfo::validateFromNdjsonFile( duckdb::Connection connection(duck_db); auto result = connection.Query(fmt::format( - "SELECT json_keys(metadata) " - "FROM read_json_auto(\"{}\") LIMIT 1; ", + "SELECT metadata.* " + "FROM read_json_auto(\"{}\") LIMIT 0; ", ndjson_file.string() )); + if (result->HasError()) { - throw silo::preprocessing::PreprocessingException( - "Preprocessing exception when retrieving the field 'metadata', " - "duckdb threw with error: " + + SPDLOG_WARN( + "Preprocessing exception when retrieving the fields of the struct 'metadata' from the " + "metadata ndjson file '{}', " + "duckdb threw with error: {}", + ndjson_file.string(), result->GetError() ); - } - if (result->RowCount() == 0) { - throw silo::preprocessing::PreprocessingException(fmt::format( - "File {} is empty, which must not be empty at this point", ndjson_file.string() - )); - } - if (result->RowCount() > 1) { - throw silo::preprocessing::PreprocessingException( - "Internal exception, expected Row Count=1, actual " + std::to_string(result->RowCount()) - ); + return; } - std::unordered_map metadata_fields_to_validate; - for (const std::string& metadata_field : preprocessing::extractStringListValue(*result, 0, 0)) { - metadata_fields_to_validate[metadata_field] = "metadata.\"" + metadata_field + "\""; + std::set actual_fields; + for (size_t idx = 0; idx < result->ColumnCount(); idx++) { + actual_fields.emplace(result->ColumnName(idx)); + if (std::find_if(database_config.schema.metadata.begin(), database_config.schema.metadata.end(), [&](const auto& metadata) { + return metadata.name == result->ColumnName(idx); + }) == database_config.schema.metadata.end()) { + SPDLOG_WARN( + "The field '{}' which is contained in the metadata file '{}' is not contained in the " + "database config.", + result->ColumnName(idx), + ndjson_file.string() + ); + } } - const std::unordered_map validated_metadata_fields = - validateFieldsAgainstConfig(metadata_fields_to_validate, database_config); + for (const auto& field : database_config.schema.metadata) { + if (!actual_fields.contains(field.name)) { + const std::string error_message = fmt::format( + "The field '{}' which is contained in the database config is not contained in the " + "input field '{}'.", + field.name, + ndjson_file.string() + ); + SPDLOG_ERROR(error_message); + throw silo::preprocessing::PreprocessingException(error_message); + } + } +} - return {validated_metadata_fields}; +std::vector MetadataInfo::getMetadataFields( + const silo::config::DatabaseConfig& database_config +) { + std::vector ret; + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back("\"" + field.name + "\""); + } + return ret; } -std::vector MetadataInfo::getMetadataFields() const { +std::vector MetadataInfo::getMetadataTypes( + const silo::config::DatabaseConfig& database_config +) { std::vector ret; - ret.reserve(metadata_selects.size()); - for (const auto& [field, _] : metadata_selects) { - ret.push_back("\"" + field + "\""); + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format("\"{}\" {}", field.name, toSQLType(field.type))); } return ret; } -std::vector MetadataInfo::getMetadataSelects() const { +std::string MetadataInfo::getMetadataStruct(const silo::config::DatabaseConfig& database_config) { + std::vector ret; + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format("{}: \'{}\'", field.name, toSQLType(field.type))); + } + return fmt::format("{{{}}}", boost::join(ret, ",")); +} + +std::vector MetadataInfo::getMetadataSelects( + const silo::config::DatabaseConfig& database_config +) { std::vector ret; - ret.reserve(metadata_selects.size()); - for (const auto& [field, select] : metadata_selects) { - ret.push_back(fmt::format(R"({} AS "{}")", select, field)); + ret.reserve(database_config.schema.metadata.size()); + for (const auto& field : database_config.schema.metadata) { + ret.push_back(fmt::format(R"( "metadata"."{0}" AS "{0}")", field.name)); } return ret; } diff --git a/src/silo/preprocessing/metadata_info.test.cpp b/src/silo/preprocessing/metadata_info.test.cpp index db2070af2..e9e21c4e9 100644 --- a/src/silo/preprocessing/metadata_info.test.cpp +++ b/src/silo/preprocessing/metadata_info.test.cpp @@ -25,7 +25,7 @@ TEST( }; EXPECT_THROW( - silo::preprocessing::MetadataInfo::validateFromMetadataFile( + silo::preprocessing::MetadataInfo::validateMetadataFile( "testBaseData/exampleDataset/small_metadata_set.tsv", some_config_with_one_column_not_in_metadata ), @@ -50,10 +50,7 @@ TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::validateFromMetadataFile( - "testBaseData/exampleDataset/small_metadata_set.tsv", valid_config - ) - .getMetadataFields(); + const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("gisaid_epi_isl")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("pango_lineage")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("date")") != fields.end()); @@ -77,10 +74,7 @@ TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::validateFromNdjsonFile( - "testBaseData/exampleDatasetAsNdjson/input_file.ndjson", valid_config - ) - .getMetadataFields(); + const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("gisaid_epi_isl")") != fields.end()); ASSERT_TRUE(std::find(fields.begin(), fields.end(), R"("pango_lineage")") != fields.end()); diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index 1b531a476..e8518c99a 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -7,6 +7,7 @@ #include "silo/common/block_timer.h" #include "silo/common/fasta_reader.h" +#include "silo/common/string_utils.h" #include "silo/common/table_reader.h" #include "silo/config/preprocessing_config.h" #include "silo/database.h" @@ -38,7 +39,18 @@ Preprocessor::Preprocessor( database_config(std::move(database_config_)), preprocessing_db(preprocessing_config.getPreprocessingDatabaseLocation(), reference_genomes), reference_genomes_(reference_genomes), - alias_lookup_(std::move(alias_lookup)) {} + alias_lookup_(std::move(alias_lookup)) { + for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { + nuc_sequences.emplace_back(seq_name); + } + for (const auto& [seq_name, _] : reference_genomes_.raw_aa_sequences) { + aa_sequences.emplace_back(seq_name); + } + if (database_config.schema.date_to_sort_by.has_value()) { + order_by_fields.emplace_back(database_config.schema.date_to_sort_by.value()); + } + order_by_fields.emplace_back(database_config.schema.primary_key); +} Database Preprocessor::preprocess() { SPDLOG_INFO( @@ -78,12 +90,12 @@ Database Preprocessor::preprocess() { buildPartitioningTable(); SPDLOG_DEBUG("preprocessing - creating insertions tables for building SILO"); createInsertionsTableFromFile( - reference_genomes_.raw_nucleotide_sequences, + nuc_sequences, preprocessing_config.getNucleotideInsertionsFilename(), NUC_INSERTION_TABLE_NAME ); createInsertionsTableFromFile( - reference_genomes_.raw_aa_sequences, + aa_sequences, preprocessing_config.getAminoAcidInsertionsFilename(), AA_INSERTION_TABLE_NAME ); @@ -94,38 +106,40 @@ Database Preprocessor::preprocess() { const auto partition_descriptor = preprocessing_db.getPartitionDescriptor(); - std::string order_by_clause = database_config.schema.getStrictOrderByClause(); - SPDLOG_INFO("preprocessing - order by clause is {}", order_by_clause); - SPDLOG_INFO("preprocessing - building database"); preprocessing_db.refreshConnection(); return buildDatabase( - partition_descriptor, order_by_clause, preprocessing_config.getIntermediateResultsDirectory() + partition_descriptor, preprocessing_config.getIntermediateResultsDirectory() ); } void Preprocessor::buildTablesFromNdjsonInput(const std::filesystem::path& file_name) { + (void)preprocessing_db.query(fmt::format( + R"-( + CREATE OR REPLACE TABLE metadata_table({}); + )-", + boost::join(MetadataInfo::getMetadataTypes(database_config), ",") + )); + if (!std::filesystem::exists(file_name)) { throw silo::preprocessing::PreprocessingException( fmt::format("The specified input file {} does not exist.", file_name.string()) ); } + if (std::filesystem::is_empty(file_name)) { - throw silo::preprocessing::PreprocessingException( - fmt::format("The specified input file {} is empty.", file_name.string()) + SPDLOG_WARN( + "The specified input file {} is empty. Ignoring its content.", file_name.string() ); + return; } SPDLOG_DEBUG("build - validating metadata file '{}' with config", file_name.string()); - const auto metadata_info = MetadataInfo::validateFromNdjsonFile(file_name, database_config); + MetadataInfo::validateNdjsonFile(file_name, database_config); (void)preprocessing_db.query(fmt::format( - R"-( - CREATE OR REPLACE TABLE metadata_table AS - SELECT {} - FROM '{}'; - )-", - boost::join(metadata_info.getMetadataSelects(), ","), + "INSERT INTO metadata_table BY NAME (SELECT {} FROM read_json_auto('{}'));", + boost::join(MetadataInfo::getMetadataSelects(database_config), ","), file_name.string() )); @@ -151,14 +165,19 @@ void Preprocessor::buildTablesFromNdjsonInput(const std::filesystem::path& file_ } void Preprocessor::buildMetadataTableFromFile(const std::filesystem::path& metadata_filename) { - const MetadataInfo metadata_info = - MetadataInfo::validateFromMetadataFile(metadata_filename, database_config); + (void)preprocessing_db.query(fmt::format( + R"-( + CREATE OR REPLACE TABLE metadata_table({}); + )-", + boost::join(MetadataInfo::getMetadataTypes(database_config), ",") + )); + + MetadataInfo::validateMetadataFile(metadata_filename, database_config); (void)preprocessing_db.query(fmt::format( - "CREATE OR REPLACE TABLE metadata_table AS\n" - "SELECT {}\n" - "FROM '{}';", - boost::join(metadata_info.getMetadataSelects(), ","), + "INSERT INTO metadata_table BY NAME (SELECT {} FROM read_csv_auto('{}', delim = '\t', " + "header = true));", + boost::join(MetadataInfo::getMetadataFields(database_config), ","), metadata_filename.string() )); } @@ -262,7 +281,7 @@ FROM metadata_table; (void)preprocessing_db.query( "CREATE OR REPLACE TABLE partition_key_to_partition AS\n" - "SELECT 0::bigint AS partition_key, 0::bigint AS partition_id;" + "SELECT ''::VARCHAR AS partition_key, 0::bigint AS partition_id;" ); (void)preprocessing_db.query( @@ -274,114 +293,143 @@ FROM metadata_table; void Preprocessor::createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name ) { - const SequenceInfo sequence_info(reference_genomes_); - sequence_info.validate(preprocessing_db.getConnection(), file_name); - - std::string partition_by_select; - std::string partition_by_where; - if (database_config.schema.partition_by.has_value()) { - partition_by_select = "partition_key_to_partition.partition_id AS partition_id"; - partition_by_where = fmt::format( - "WHERE (metadata.\"{0}\" = partition_key_to_partition.partition_key) OR " - "(metadata.\"{0}\" IS NULL AND " - "partition_key_to_partition.partition_key IS NULL)", - database_config.schema.partition_by.value() - ); - } else { - partition_by_select = "0 AS partition_id"; - partition_by_where = ""; + if (std::filesystem::is_empty(file_name)) { } + SequenceInfo::validateNdjsonFile( + reference_genomes_, preprocessing_db.getConnection(), file_name + ); - createUnalignedPartitionedSequenceFiles(file_name, partition_by_select, partition_by_where); + createUnalignedPartitionedSequenceFiles(file_name); - createAlignedPartitionedSequenceViews( - file_name, sequence_info, partition_by_select, partition_by_where - ); + createAlignedPartitionedSequenceViews(file_name); } -void Preprocessor::createAlignedPartitionedSequenceViews( - const std::filesystem::path& file_name, - const SequenceInfo& sequence_info, - const std::string& partition_by_select, - const std::string& partition_by_where -) { - std::string order_by_select = ", metadata.\"" + database_config.schema.primary_key + "\" AS \"" + - database_config.schema.primary_key + "\""; - std::string order_by_fields = ", \"" + database_config.schema.primary_key + "\""; - if (database_config.schema.date_to_sort_by.has_value()) { - order_by_select += ", metadata.\"" + database_config.schema.date_to_sort_by.value() + - "\" AS \"" + database_config.schema.date_to_sort_by.value() + "\""; - order_by_fields += ", \"" + database_config.schema.date_to_sort_by.value() + "\""; +void Preprocessor::createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name) { + std::string file_reader_sql; + if (std::filesystem::is_empty(file_name)) { + file_reader_sql = fmt::format( + "SELECT ''::VARCHAR AS key, ''::VARCHAR AS partition_key, {}, {}, {}, {}, {}", + boost::join(silo::prepend("''::VARCHAR AS nuc_", nuc_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS aa_", aa_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS nuc_insertions_", nuc_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS aa_insertions_", aa_sequences), ", "), + boost::join(silo::prepend("''::VARCHAR AS order_by_field_", order_by_fields), ", ") + ); + } else { + file_reader_sql = fmt::format( + "SELECT metadata.\"{}\" AS key, {} AS partition_key, {}, {}, {}, {}, {} FROM " + "read_json_auto('{}')", + database_config.schema.primary_key, + database_config.schema.partition_by.has_value() + ? fmt::format("metadata.\"{}\"", database_config.schema.partition_by.value()) + : "''::VARCHAR", + boost::join( + silo::tie( + "alignedNucleotideSequences.\"", nuc_sequences, "\" AS nuc_", nuc_sequences, "" + ), + ", " + ), + boost::join( + silo::tie("alignedAminoAcidSequences.\"", aa_sequences, "\" AS aa_", aa_sequences, ""), + ", " + ), + boost::join( + silo::tie( + "nucleotideInsertions.\"", nuc_sequences, "\" AS nuc_insertions_", nuc_sequences, "" + ), + ", " + ), + boost::join( + silo::tie( + "aminoAcidInsertions.\"", aa_sequences, "\" AS aa_insertions_", aa_sequences, "" + ), + ", " + ), + boost::join( + silo::tie("metadata.\"", order_by_fields, "\" AS order_by_field_", order_by_fields, ""), + ", " + ), + file_name.string() + ); } (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE TABLE sequence_table AS\n" - "SELECT metadata.\"{}\" AS key, {}, nucleotideInsertions, aminoAcidInsertions, {} {} \n" - "FROM '{}', partition_key_to_partition " - "{};", - database_config.schema.primary_key, - boost::join(sequence_info.getAlignedSequenceSelects(preprocessing_db), ","), - partition_by_select, - order_by_select, - file_name.string(), - partition_by_where + "SELECT key, partition_key_to_partition.partition_id AS partition_id, {}, {}, {}, {} \n" + "FROM ({}) file_reader " + "JOIN partition_key_to_partition " + "ON (file_reader.partition_key = partition_key_to_partition.partition_key);", + boost::join( + SequenceInfo::getAlignedSequenceSelects(reference_genomes_, preprocessing_db), ", " + ), + boost::join(prepend("nuc_insertions_", nuc_sequences), ", "), + boost::join(prepend("aa_insertions_", aa_sequences), ", "), + boost::join(prepend("order_by_field_", order_by_fields), ", "), + file_reader_sql )); (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {} AS\n" - "SELECT key, partition_id, nucleotideInsertions.* {} \n" - "FROM sequence_table", + "SELECT key, partition_id, {}, {} \n" + "FROM sequence_table;", NUC_INSERTION_TABLE_NAME, - order_by_fields + boost::join(silo::tie("nuc_insertions_", nuc_sequences, " AS ", nuc_sequences, " "), ","), + boost::join(prepend("order_by_field_", order_by_fields), ",") )); (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {} AS\n" - "SELECT key, partition_id, aminoAcidInsertions.* {} \n" - "FROM sequence_table", + "SELECT key, partition_id, {}, {} \n" + "FROM sequence_table;", AA_INSERTION_TABLE_NAME, - order_by_fields + boost::join(silo::tie("aa_insertions_", aa_sequences, " AS ", aa_sequences, " "), ","), + boost::join(prepend("order_by_field_", order_by_fields), ",") )); for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW nuc_{0} AS\n" - "SELECT key, nuc_{0} AS sequence, partition_id" - "{1}" + "SELECT key, nuc_{0} AS sequence, partition_id, {1} " "FROM sequence_table;", seq_name, - order_by_fields + boost::join(prepend("order_by_field_", order_by_fields), ",") )); } for (const auto& [seq_name, _] : reference_genomes_.raw_aa_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW gene_{0} AS\n" - "SELECT key, gene_{0} AS sequence, partition_id" - "{1}" + "SELECT key, gene_{0} AS sequence, partition_id, {1} " "FROM sequence_table;", seq_name, - order_by_fields + boost::join(prepend("order_by_field_", order_by_fields), ",") )); } } -void Preprocessor::createUnalignedPartitionedSequenceFiles( - const std::filesystem::path& file_name, - const std::string& partition_by_select, - const std::string& partition_by_where -) { +void Preprocessor::createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name) { for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { + const std::string file_reader_sql = + std::filesystem::is_empty(file_name) + ? fmt::format("SELECT ''::VARCHAR AS key, ''::VARCHAR AS unaligned_nuc_{}", seq_name) + : fmt::format( + "SELECT metadata.\"{0}\" AS key, {1} AS partition_key, " + " unalignedNucleotideSequences.\"{2}\" AS unaligned_nuc_{2} " + "FROM read_json_auto('{3}')", + database_config.schema.primary_key, + database_config.schema.partition_by.has_value() + ? fmt::format("metadata.\"{}\"", database_config.schema.partition_by.value()) + : "''::VARCHAR", + seq_name, + file_name.string() + ); const std::string table_sql = fmt::format( - "SELECT metadata.\"{}\" AS key, {}," - "{} \n" - "FROM '{}', partition_key_to_partition " - "{}", - database_config.schema.primary_key, + "SELECT key, {}, partition_key_to_partition.partition_id \n" + "FROM ({}) file_reader " + "JOIN partition_key_to_partition " + "ON (file_reader.partition_key = partition_key_to_partition.partition_key) ", SequenceInfo::getUnalignedSequenceSelect(seq_name, preprocessing_db), - partition_by_select, - file_name.string(), - partition_by_where + file_reader_sql ); createUnalignedPartitionedSequenceFile(seq_name, table_sql); } @@ -403,12 +451,12 @@ void Preprocessor::createUnalignedPartitionedSequenceFile( } void Preprocessor::createInsertionsTableFromFile( - const std::map& expected_sequences, + const std::vector& expected_sequences, const std::filesystem::path& insertion_file, const std::string& table_name ) { std::set expected_sequence_columns; - for (const auto& [sequence_name, _] : expected_sequences) { + for (const auto& sequence_name : expected_sequences) { expected_sequence_columns.emplace(sequence_name); } @@ -442,14 +490,25 @@ void Preprocessor::createInsertionsTableFromFile( } (void)preprocessing_db.query(fmt::format( - "CREATE TABLE {0} as " - "(SELECT ins.\"{1}\" as key, * " - "FROM read_csv('{2}', delim = '\t', header = true," - " columns = {{{3}}}) ins, " + "CREATE OR REPLACE TABLE {0} AS " + "(SELECT ins.\"{1}\" AS key, {2}, {3}, partition_id " + "FROM read_csv('{4}', delim = '\t', header = true," + " columns = {{{5}}}) ins, " "partitioned_metadata " "WHERE ins.\"{1}\" == partitioned_metadata.\"{1}\" );", table_name, database_config.schema.primary_key, + boost::join(silo::tie("ins.\"", expected_sequences, "\" AS ", expected_sequences, " "), ","), + boost::join( + silo::tie( + "partitioned_metadata.\"", + order_by_fields, + "\" AS order_by_field_", + order_by_fields, + " " + ), + "," + ), insertion_file.string(), boost::join(column_structs, ",") )); @@ -503,13 +562,6 @@ void Preprocessor::createPartitionedTableForSequence( const std::filesystem::path& filename, const std::string& table_prefix ) { - std::string order_by_select = ", raw.key AS " + database_config.schema.primary_key; - if (database_config.schema.date_to_sort_by.has_value()) { - order_by_select += ", partitioned_metadata." + - database_config.schema.date_to_sort_by.value() + " AS " + - database_config.schema.date_to_sort_by.value(); - } - const std::string raw_table_name = "raw_" + table_prefix + sequence_name; const std::string table_name = table_prefix + sequence_name; @@ -519,13 +571,17 @@ void Preprocessor::createPartitionedTableForSequence( R"-( CREATE OR REPLACE VIEW {} AS SELECT key, sequence, - partitioned_metadata.partition_id AS partition_id - {} + partitioned_metadata.partition_id AS partition_id, {} FROM {} AS raw RIGHT JOIN partitioned_metadata ON raw.key = partitioned_metadata."{}"; )-", table_name, - order_by_select, + boost::join( + silo::tie( + "partitioned_metadata.", order_by_fields, " AS order_by_field_", order_by_fields, " " + ), + "," + ), raw_table_name, database_config.schema.primary_key )); @@ -533,7 +589,6 @@ void Preprocessor::createPartitionedTableForSequence( Database Preprocessor::buildDatabase( const preprocessing::Partitions& partition_descriptor, - const std::string& order_by_clause, const std::filesystem::path& intermediate_results_directory ) { Database database; @@ -559,18 +614,28 @@ Database Preprocessor::buildDatabase( tasks.run([&]() { SPDLOG_INFO("build - building metadata store in parallel"); - buildMetadataStore(database, partition_descriptor, order_by_clause); + buildMetadataStore( + database, partition_descriptor, "ORDER BY " + boost::join(order_by_fields, ",") + ); SPDLOG_INFO("build - finished metadata store"); }); tasks.run([&]() { SPDLOG_INFO("build - building nucleotide sequence stores"); - buildNucleotideSequenceStore(database, partition_descriptor, order_by_clause); + buildNucleotideSequenceStore( + database, + partition_descriptor, + "ORDER BY " + boost::join(prepend("order_by_field_", order_by_fields), ",") + ); SPDLOG_INFO("build - finished nucleotide sequence stores"); SPDLOG_INFO("build - building amino acid sequence stores"); - buildAminoAcidSequenceStore(database, partition_descriptor, order_by_clause); + buildAminoAcidSequenceStore( + database, + partition_descriptor, + "ORDER BY " + boost::join(prepend("order_by_field_", order_by_fields), ",") + ); SPDLOG_INFO("build - finished amino acid sequence stores"); }); diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 67e729f1b..4c262122a 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -13,14 +13,14 @@ namespace { struct Scenario { - std::string input_directory; + std::filesystem::path input_directory; uint expected_sequence_count; std::string query; nlohmann::json expected_query_result; }; std::string printTestName(const ::testing::TestParamInfo& info) { - std::string name = "Dir_" + info.param.input_directory; + std::string name = "Dir_" + info.param.input_directory.string(); std::replace(name.begin(), name.end(), '/', '_'); return name; } @@ -91,8 +91,8 @@ const Scenario TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD = { }; const Scenario EMPTY_INPUT_TSV = { - .input_directory = "testBaseData/exampleDatasetEmpty/", - .expected_sequence_count = 2, + .input_directory = "testBaseData/emptyInputTsv/", + .expected_sequence_count = 0, .query = R"( { "action": { @@ -108,8 +108,8 @@ const Scenario EMPTY_INPUT_TSV = { }; const Scenario EMPTY_INPUT_NDJSON = { - .input_directory = "testBaseData/exampleDatasetEmpty/", - .expected_sequence_count = 2, + .input_directory = "testBaseData/emptyInputNdjson/", + .expected_sequence_count = 0, .query = R"( { "action": { @@ -140,15 +140,15 @@ INSTANTIATE_TEST_SUITE_P( printTestName ); -TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) { +TEST_P(PreprocessorTestFixture, shouldProcessData) { const auto scenario = GetParam(); + silo::config::PreprocessingConfig config{.input_directory = scenario.input_directory}; - silo::config::PreprocessingConfig config; - config.overwrite(silo::config::YamlConfig(scenario.input_directory + "preprocessing_config.yaml") + config.overwrite(silo::config::YamlConfig(scenario.input_directory / "preprocessing_config.yaml") ); const auto database_config = silo::config::ConfigRepository().getValidatedConfig( - scenario.input_directory + "database_config.yaml" + scenario.input_directory / "database_config.yaml" ); const auto reference_genomes = @@ -163,7 +163,9 @@ TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) { const auto database_info = database.getDatabaseInfo(); - EXPECT_GT(database_info.total_size, 0UL); + if (scenario.expected_sequence_count > 0) { + EXPECT_GT(database_info.total_size, 0UL); + } EXPECT_EQ(database_info.sequence_count, scenario.expected_sequence_count); const silo::query_engine::QueryEngine query_engine(database); diff --git a/src/silo/preprocessing/sequence_info.cpp b/src/silo/preprocessing/sequence_info.cpp index 8200305a7..2620e45d0 100644 --- a/src/silo/preprocessing/sequence_info.cpp +++ b/src/silo/preprocessing/sequence_info.cpp @@ -9,24 +9,18 @@ namespace silo::preprocessing { -SequenceInfo::SequenceInfo(const silo::ReferenceGenomes& reference_genomes) { - for (const auto& [name, sequence] : reference_genomes.raw_nucleotide_sequences) { - nuc_sequence_names.push_back(name); - } - for (const auto& [name, sequence] : reference_genomes.raw_aa_sequences) { - aa_sequence_names.push_back(name); - } -} - std::vector SequenceInfo::getAlignedSequenceSelects( + const silo::ReferenceGenomes& reference_genomes, const PreprocessingDatabase& preprocessing_db -) const { +) { std::vector sequence_selects; - sequence_selects.reserve(nuc_sequence_names.size() + aa_sequence_names.size()); - for (const std::string& name : nuc_sequence_names) { + sequence_selects.reserve( + reference_genomes.nucleotide_sequences.size() + reference_genomes.aa_sequences.size() + ); + for (const auto& [name, _] : reference_genomes.nucleotide_sequences) { sequence_selects.emplace_back(getNucleotideSequenceSelect(name, preprocessing_db)); } - for (const std::string& name : aa_sequence_names) { + for (const auto& [name, _] : reference_genomes.aa_sequences) { sequence_selects.emplace_back(getAminoAcidSequenceSelect(name, preprocessing_db)); } return sequence_selects; @@ -36,8 +30,7 @@ std::string SequenceInfo::getNucleotideSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("alignedNucleotideSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("nuc_{}", seq_name); return fmt::format( "{0} AS nuc_{1}", @@ -52,8 +45,7 @@ std::string SequenceInfo::getUnalignedSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("unalignedNucleotideSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("unaligned_nuc_{}", seq_name); return fmt::format( "{0} AS unaligned_nuc_{1}", preprocessing_db.compress_nucleotide_functions.at(seq_name)->generateSqlStatement( @@ -67,8 +59,7 @@ std::string SequenceInfo::getAminoAcidSequenceSelect( std::string_view seq_name, const PreprocessingDatabase& preprocessing_db ) { - const std::string column_name_in_data = - fmt::format("alignedAminoAcidSequences.\"{}\"", seq_name); + const std::string column_name_in_data = fmt::format("aa_{}", seq_name); return fmt::format( "{0} AS gene_{1}", @@ -110,10 +101,29 @@ void validateStruct( } } -void SequenceInfo::validate( +void SequenceInfo::validateNdjsonFile( + const silo::ReferenceGenomes& reference_genomes, duckdb::Connection& connection, const std::filesystem::path& input_filename -) const { +) { + if (std::filesystem::is_empty(input_filename)) { + return; + } + + std::vector nuc_sequence_names; + std::transform( + reference_genomes.raw_nucleotide_sequences.begin(), + reference_genomes.raw_nucleotide_sequences.end(), + std::back_inserter(nuc_sequence_names), + [](auto& pair) { return pair.first; } + ); + std::vector aa_sequence_names; + std::transform( + reference_genomes.raw_aa_sequences.begin(), + reference_genomes.raw_aa_sequences.end(), + std::back_inserter(aa_sequence_names), + [](auto& pair) { return pair.first; } + ); auto result = connection.Query(fmt::format( "SELECT json_keys(alignedNucleotideSequences), json_keys(alignedAminoAcidSequences), " "json_keys(unalignedNucleotideSequences), json_keys(nucleotideInsertions), " diff --git a/src/silo/preprocessing/sequence_info.test.cpp b/src/silo/preprocessing/sequence_info.test.cpp index 54874842b..a484d56f1 100644 --- a/src/silo/preprocessing/sequence_info.test.cpp +++ b/src/silo/preprocessing/sequence_info.test.cpp @@ -13,25 +13,23 @@ TEST(SequenceInfo, validatesSuccessfulOnCorrectFile) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); - ASSERT_NO_THROW(sequence_info.validate( - connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" + ASSERT_NO_THROW(SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" )); } TEST(SequenceInfo, failWhenTooManyGenomesInReferences) { const auto reference_genomes = ReferenceGenomes::readFromFile("testBaseData/exampleDataset/reference_genomes.json"); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate( - connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/exampleDataset1000Sequences/sample.ndjson.zst" ), silo::preprocessing::PreprocessingException ); @@ -41,12 +39,13 @@ TEST(SequenceInfo, failWhenTooManyGenomesInJson) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate(connection, "testBaseData/ndjsonFiles/oneline_second_nuc.json.zst"), + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/ndjsonFiles/oneline_second_nuc.json.zst" + ), silo::preprocessing::PreprocessingException ); } @@ -55,12 +54,13 @@ TEST(SequenceInfo, failWhenTooFewAASequencesInJson) { const auto reference_genomes = ReferenceGenomes::readFromFile( "testBaseData/exampleDataset1000Sequences/reference_genomes.json" ); - const SequenceInfo sequence_info(reference_genomes); duckdb::DuckDB duckdb; duckdb::Connection connection(duckdb); ASSERT_THROW( - sequence_info.validate(connection, "testBaseData/ndjsonFiles/oneline_without_ORF.json.zst"), + SequenceInfo::validateNdjsonFile( + reference_genomes, connection, "testBaseData/ndjsonFiles/oneline_without_ORF.json.zst" + ), silo::preprocessing::PreprocessingException ); } \ No newline at end of file diff --git a/testBaseData/emptyInputNdjson/aa_insertions.tsv b/testBaseData/emptyInputNdjson/aa_insertions.tsv new file mode 100644 index 000000000..f1ef77a70 --- /dev/null +++ b/testBaseData/emptyInputNdjson/aa_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl S ORF1a E M N ORF1b ORF3a ORF6 ORF7a ORF7b ORF8 ORF9b diff --git a/testBaseData/emptyInputNdjson/database_config.yaml b/testBaseData/emptyInputNdjson/database_config.yaml index 1da270b08..a3e0b45dd 100644 --- a/testBaseData/emptyInputNdjson/database_config.yaml +++ b/testBaseData/emptyInputNdjson/database_config.yaml @@ -22,10 +22,6 @@ schema: type: int - name: qc_value type: float - - name: nucleotideInsertions - type: insertion - - name: aminoAcidInsertions - type: aaInsertion primaryKey: gisaid_epi_isl dateToSortBy: date partitionBy: pango_lineage \ No newline at end of file diff --git a/testBaseData/emptyInputNdjson/nuc_insertions.tsv b/testBaseData/emptyInputNdjson/nuc_insertions.tsv new file mode 100644 index 000000000..59063c0ea --- /dev/null +++ b/testBaseData/emptyInputNdjson/nuc_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl main testSecondSequence diff --git a/testBaseData/emptyInputTsv/aa_insertions.tsv b/testBaseData/emptyInputTsv/aa_insertions.tsv new file mode 100644 index 000000000..f1ef77a70 --- /dev/null +++ b/testBaseData/emptyInputTsv/aa_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl S ORF1a E M N ORF1b ORF3a ORF6 ORF7a ORF7b ORF8 ORF9b diff --git a/testBaseData/emptyInputTsv/database_config.yaml b/testBaseData/emptyInputTsv/database_config.yaml index 1da270b08..a3e0b45dd 100644 --- a/testBaseData/emptyInputTsv/database_config.yaml +++ b/testBaseData/emptyInputTsv/database_config.yaml @@ -22,10 +22,6 @@ schema: type: int - name: qc_value type: float - - name: nucleotideInsertions - type: insertion - - name: aminoAcidInsertions - type: aaInsertion primaryKey: gisaid_epi_isl dateToSortBy: date partitionBy: pango_lineage \ No newline at end of file diff --git a/testBaseData/emptyInputTsv/nuc_insertions.tsv b/testBaseData/emptyInputTsv/nuc_insertions.tsv new file mode 100644 index 000000000..59063c0ea --- /dev/null +++ b/testBaseData/emptyInputTsv/nuc_insertions.tsv @@ -0,0 +1 @@ +gisaid_epi_isl main testSecondSequence diff --git a/testBaseData/emptyInputTsv/small_metadata_set.tsv b/testBaseData/emptyInputTsv/small_metadata_set.tsv index d55907bf5..da32055fa 100644 --- a/testBaseData/emptyInputTsv/small_metadata_set.tsv +++ b/testBaseData/emptyInputTsv/small_metadata_set.tsv @@ -1,101 +1 @@ gisaid_epi_isl pango_lineage date region country division unsorted_date age qc_value nucleotideInsertions aminoAcidInsertions -EPI_ISL_1408408 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 S:214:EPE -EPI_ISL_1749899 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 -EPI_ISL_2016901 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 -EPI_ISL_1749892 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 -EPI_ISL_1597932 B.1.1.7 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 S:214:EPE -EPI_ISL_1407962 B.1.1.7 Europe Switzerland Solothurn 2021-01-16 55 0.93 -EPI_ISL_1750503 B.1.258.17 2020-12-24 Europe Switzerland Zürich 2021-02-14 56 0.92 -EPI_ISL_1360935 B.1.1.7 2021-03-08 Europe Switzerland Jura 2021-01-03 57 0.91 -EPI_ISL_2019235 B.1.1.7 2021-04-28 Europe Switzerland Basel-Stadt 2021-01-22 58 0.90 -EPI_ISL_1749960 B.1.1.7 2021-04-15 Europe Switzerland Basel-Land 2021-02-03 59 0.89 -EPI_ISL_1361468 B.1.1.7 2021-03-06 Europe Switzerland Zürich 2021-01-20 50 0.98 -EPI_ISL_1408062 B.1.1.7 2021-03-03 Europe Switzerland Valais 2020-11-24 50 0.97 22204:CAGAA -EPI_ISL_1597890 B.1.1.7 2021-03-21 Switzerland Vaud 2021-01-25 51 0.96 22339:GCTGGT -EPI_ISL_1682849 XA.1 2020-12-17 Europe Switzerland Thurgau 2021-01-21 52 0.95 -EPI_ISL_1408805 B.1.221 2020-11-24 Europe Switzerland Schwyz 2020-12-09 53 0.94 -EPI_ISL_1750868 B.1.1.189 2020-12-15 Europe Switzerland Solothurn 2021-01-20 54 0.93 S:214:EPE -EPI_ISL_2019350 B.1.1.7 2021-04-27 Europe Switzerland Valais 2020-12-21 55 0.92 -EPI_ISL_2017036 B.1.1.7 2021-04-23 Europe Switzerland Solothurn 2021-03-09 56 0.91 -EPI_ISL_1599113 B.1.1.39 2020-12-08 Europe Switzerland Zürich 2021-03-05 57 0.90 -EPI_ISL_2214128 B.1.1.7 2021-05-10 Europe Switzerland Geneva 2020-11-13 58 0.89 -EPI_ISL_2408472 B.1.1.7 2021-05-25 Europe Switzerland Obwalden 2021-03-02 59 0.98 -EPI_ISL_830864 B.1.177 2020-10-08 Europe Switzerland Basel-Stadt 2021-03-03 50 0.97 -EPI_ISL_581968 B.1.160 2020-08-17 Europe Switzerland Basel-Stadt 2021-03-25 50 0.96 S:214:EPE -EPI_ISL_2213804 Q.7 2021-05-08 Europe Switzerland Geneva 2021-04-12 51 25701:CCC -EPI_ISL_2405276 B.1.1.7 2021-05-24 Europe Switzerland Vaud 2021-04-28 52 0.94 -EPI_ISL_2213934 B.1.1.7 2021-05-13 Europe Switzerland Geneva 2021-04-23 53 0.93 -EPI_ISL_2213984 B.1.1.7 2021-05-08 Europe Switzerland Geneva 2021-05-09 54 0.92 25701:CCC -EPI_ISL_2574088 B.1.1.7 2021-06-10 Europe Switzerland Sankt Gallen 2021-05-05 55 0.91 25701:CCC -EPI_ISL_2544226 B.1.1.7 2021-06-05 Europe Switzerland Ticino 2021-05-12 56 0.90 -EPI_ISL_2360326 Q.7 2021-05-23 Europe Switzerland Ticino 2021-03-10 57 0.89 -EPI_ISL_2379651 B.1.1.7 2021-05-11 Europe Switzerland Valais 2021-06-01 58 0.98 -EPI_ISL_1036103 B.1.258 2020-12-09 Europe Switzerland Aargau 2021-06-03 59 0.97 -EPI_ISL_931279 B.1.1 2020-10-28 Europe Switzerland Basel-Stadt 2021-05-11 50 0.96 -EPI_ISL_931031 B.1.177 2020-10-22 Europe Switzerland Basel-Stadt 2021-05-10 50 0.95 -EPI_ISL_1273458 B.1.1.7 2021-01-26 Europe Switzerland Basel-Land 2021-05-18 51 0.94 25701:CCC -EPI_ISL_1273715 B.1.160 2021-01-20 Europe Switzerland Basel-Stadt 2021-05-08 52 0.93 -EPI_ISL_737604 B.1.1 2020-12-14 Europe Switzerland Bern 2021-05-14 53 0.92 -EPI_ISL_1129663 B.1.1.7 2020-12-29 Europe Switzerland Bern 2021-05-07 54 0.91 -EPI_ISL_1003629 B.1.1.39 2021-01-25 Europe Switzerland Aargau 2021-05-18 55 0.90 S:214:EPE -EPI_ISL_737715 B.1.177 2020-12-13 Europe Switzerland Bern 2021-05-16 56 0.89 S:247:SGE -EPI_ISL_1003036 B.1.177 2021-01-16 Europe Switzerland Aargau 2021-07-14 57 0.98 5959:TAT -EPI_ISL_899762 B.1.177 2020-12-25 Europe Switzerland Schwyz 2021-07-19 58 0.97 -EPI_ISL_899725 B.1.177 2021-01-12 Europe Switzerland Solothurn 2021-07-14 59 0.96 S:210:IV -EPI_ISL_1195052 B.1.1.7 2021-02-23 Europe Switzerland Solothurn 2021-07-04 50 0.95 -EPI_ISL_1003519 B.1.160.16 2021-01-22 Europe Switzerland 2021-07-29 50 0.94 -EPI_ISL_1003010 B.1.36.35 2021-01-15 Europe Switzerland Solothurn 2021-07-19 51 0.93 -EPI_ISL_1119584 B.1.1 2020-11-04 Europe Switzerland Solothurn 2021-07-05 52 0.92 -EPI_ISL_1002052 B.1 2021-01-15 Europe Switzerland Solothurn 2021-07-15 53 0.91 -EPI_ISL_466942 B.1 2020-03-08 Europe Switzerland Basel-Stadt 2021-05-12 54 0.90 -EPI_ISL_1003849 B.1.160 2021-01-29 Europe Switzerland Neuchâtel 2021-08-05 55 0.89 -EPI_ISL_768148 GD.1 2020-12-24 Europe Switzerland Sankt Gallen 2020-03-16 56 0.98 25701:CCC -EPI_ISL_1080536 B.1.1.7 2021-02-10 Europe Switzerland Basel-Land 2021-08-04 57 0.97 -EPI_ISL_1002156 B.1.221 2021-01-16 Europe Switzerland Basel-Land 2021-02-03 58 0.96 -EPI_ISL_1119315 B.1.1.7 2021-02-14 Europe Switzerland Graubünden 2021-03-18 59 0.95 -EPI_ISL_1004495 B.1.177.44 2021-01-03 Europe Switzerland 2021-04-13 50 0.94 25701:CCC -EPI_ISL_1001920 B.1.177 2021-01-22 Europe Switzerland Bern 2021-04-25 50 0.93 -EPI_ISL_1131102 B.1.160 2021-02-03 Europe Switzerland Zürich 2021-04-13 51 0.92 -EPI_ISL_1003373 B.1.177 2021-01-20 Europe Switzerland Zürich 2021-03-19 52 0.91 -EPI_ISL_721941 B.1.1.70 2020-11-24 Europe Switzerland Zürich 2021-03-15 53 0.90 -EPI_ISL_1130868 B.1.525 2021-01-25 Europe Switzerland Zürich 2020-12-24 54 0.89 25701:CCC -EPI_ISL_1003425 B.1.177 2021-01-21 Europe Switzerland Uri 2021-03-08 55 0.98 -EPI_ISL_737860 B.1.160 2020-12-09 Europe Switzerland Valais 2021-04-28 56 0.97 -EPI_ISL_1001493 B.1.177.44 2021-01-20 Europe Switzerland Vaud 2021-04-15 57 0.96 -EPI_ISL_1260480 B.1.160 2020-12-21 Europe Switzerland Zürich 2021-03-06 58 0.95 -EPI_ISL_1747885 B.1.1.7 2021-03-09 Europe Switzerland Solothurn 2021-03-03 59 0.94 -EPI_ISL_1747752 B.1.1.7 2021-03-05 Europe Switzerland Basel-Land 2021-03-21 50 0.93 -EPI_ISL_1005148 B.1.221 2020-11-13 Europe Switzerland Solothurn 2020-12-17 50 0.92 25701:CCC -EPI_ISL_1748243 B.1.1.7 2021-03-02 Europe Switzerland Solothurn 2020-11-24 0.91 -EPI_ISL_1748215 B.1.1.7 2021-03-03 Europe Switzerland Solothurn 2020-12-15 52 0.90 -EPI_ISL_1748395 B.1.1.7 2021-03-25 Europe Switzerland Basel-Stadt 2021-04-27 53 0.89 -EPI_ISL_1760534 B.1.1.7 2021-04-12 Europe Switzerland Ticino 2021-04-23 54 0.98 -EPI_ISL_2086867 C.36.3 2021-04-28 Europe Switzerland Zürich 2020-12-08 55 0.97 25701:CCC -EPI_ISL_1840634 Q.7 2021-04-23 Europe Switzerland Ticino 2021-05-10 56 0.96 -EPI_ISL_2180995 B.1.1.7 2021-05-09 Europe Switzerland Basel-Stadt 2021-05-25 57 0.95 -EPI_ISL_2181005 B.1.1.7 2021-05-05 Europe Switzerland Basel-Stadt 2020-10-08 58 0.94 -EPI_ISL_2180023 B.1.1.7 2021-05-12 Europe Switzerland Ticino 2020-08-17 59 0.93 25701:CCC -EPI_ISL_2270139 B.1.1.7 2021-03-10 Europe Switzerland Basel-Stadt 2021-05-08 50 0.92 -EPI_ISL_2544452 B.1.1.7 2021-06-01 Europe Switzerland Schwyz 2021-05-24 50 0.91 -EPI_ISL_2544332 B.1.1.7 2021-06-03 Europe Switzerland Bern 2021-05-13 51 0.90 25701:CCC -EPI_ISL_2307766 B.1.1.7 2021-05-11 Europe Switzerland Bern 2021-05-08 52 0.89 -EPI_ISL_2375490 B.1.1.7 2021-05-10 Europe Switzerland Valais 2021-06-10 53 0.98 -EPI_ISL_2374969 B.1.1.7 2021-05-18 Europe Switzerland Aargau 2021-06-05 54 0.97 25701:CCC -EPI_ISL_2307888 B.1.1.7 2021-05-08 Europe Switzerland Solothurn 2021-05-23 55 0.96 -EPI_ISL_2375247 B.1.1.7 2021-05-14 Europe Switzerland Sankt Gallen 2021-05-11 56 25701:CCC -EPI_ISL_2308054 B.1.1.7 2021-05-07 Europe Switzerland Zürich 2020-12-09 57 0.94 -EPI_ISL_2375165 B.1.1.7 2021-05-18 Europe Switzerland Basel-Land 2020-10-28 58 0.93 -EPI_ISL_2375097 B.1.1.7 2021-05-16 Europe Switzerland Basel-Land 2020-10-22 59 0.92 -EPI_ISL_3128737 AY.9.2 2021-07-14 Europe Switzerland Zürich 2021-01-26 50 0.91 -EPI_ISL_3128811 B.1.617.2 2021-07-19 Europe Switzerland Aargau 2021-01-20 50 0.90 -EPI_ISL_3086369 AY.122 2021-07-14 Europe Switzerland Ticino 2020-12-14 51 0.89 25701:CCC -EPI_ISL_3259931 AY.43 2021-07-04 Europe Switzerland Vaud 2020-12-29 52 0.98 S:143:T -EPI_ISL_3267832 AY.43 2021-07-29 Europe Switzerland Bern 2021-01-25 53 0.97 -EPI_ISL_3128796 B.1.617.2 2021-07-19 Europe Switzerland Zürich 2020-12-13 54 0.96 25701:CCC -EPI_ISL_3016465 B.1.1.7 2021-07-05 Europe Switzerland Valais 2021-01-16 0.95 -EPI_ISL_3247294 2021-07-15 Europe Switzerland Basel-Stadt 2020-12-25 56 0.94 -EPI_ISL_3578231 P.1 2021-05-12 Europe Switzerland Zürich 2021-01-12 57 0.93 25701:CCC -EPI_ISL_3465732 AY.43 2021-08-05 Europe Switzerland Vaud 2021-02-23 58 0.92 -EPI_ISL_2367431 B.1 2020-03-16 Europe Switzerland Vaud 2021-01-22 59 0.91 -EPI_ISL_3465556 AY.43 2021-08-04 Europe Switzerland Solothurn 2021-01-15 50 0.90 -EPI_ISL_2359636 B.1.1.189 2021-02-03 Europe Switzerland Vaud 2020-11-04 57 0.89 25701:CCC ORF1a:3602:F diff --git a/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst b/testBaseData/ndjsonFiles/oneline_second_nuc.json.zst index 6df5de35cb4a340059e16fe45259152671042457..f1a3f8aed7a4f6b9069f65dc04f217c16f35d636 100644 GIT binary patch delta 5912 zcmV+z7w72XPT)=oD77#Bq=Xv*0EMv%Mi2pxlTi>pe>9HrA3DQ@h=p4mzwT*Krtc=R zTL9sa+hS-zmNAL_l#}M|4*T%4(VQ;O>o4i~Kbry$Eh`X3B$I1}6V?^p;04g-(wx6f1T>;4XBWD6rkoR@6$>U`+5Z||ZjjEdAh3ikM z4Q;y^zTZ_84^nj?D20TeV!oT;*mn&ne`a4VRS5{YXcldDJRpIQQ{cTwo@3avX0&ud z5>enuD++!y9r6u#T41HRI`q*K*3bqj##{e^YIP6K4*<;smI)0%>K-R=365JcCD!Mn zeoG$8C085NHmNekT6Gz;>r_rF2rq$3tdJZaE9cpJH`^IZsz`*FsnMUNjYf7lA% zZmMT>Dac7VC8bFv0(XOHnP0cw^iMEa9twLWiWc;TC!ZaZPm zu0;td~8hP|{1+EZlB;4Y z(uRx^3pE!8*}5P0^n82j-67|St_8D9)cZ#Z3v1C`$(;k0g%13z*YNOr8W=FuFww{5 z2+)N5yl4_5j+E8znyLrB=}Djem`YiS+N!0;&Akd6)Ih;=kA_UZPnRSHe_T8-D70a{ z{F}`b!uZiZjh8Ut#PxJ@2a0GE+rRSiEe(v7iEHkK671B!x`ApYd%a&XDy-;dH#`OR zOY8ewK{K=E+XbA0Z=CoG;Z;2R#>wvIs)EmmiZG%9H%2%8-^@q+`l>x6QAk;)jw5We$vB7P`f4YhyBrSRxwr@@0 zvf!j;k**VwjBqRDBlN^Gu{R{0%5e#nttQRx#ikYz+?_NZdCx2GfzV*(T@Uey4HOJQ z-9u=Q591&@;DCULdU_q5i%%lqTl++FU;4|*mT(p(_2WSnw_H$R$u)X-aDGi(D43_C z$;$Lmxb6RXcM3{+f7M5E&Vn6z#TjQkb5zRk1|E`jWrTH~;eYvdaymu}8qcv@`h2v8 z=E~e^+sq@h4m9ZXE?nYWPj#tH!S68xm9BTh7}5bP7LjfKtI>oVwvg;#C`|NZa0ninSf8OZLx#F-?J>HwU!j=#o zT_{;}F=4>dfFU6$Nm5$PVbQAy-+=DnDkD$|v{6=uZV?%uWD%O&^WlpcK z3<)TI_anNZWpbExyXuP{C0ba$!4oL1{d@tt>ViUyB;HUn(zp{G4HLiZ*v!R6r+y_O zHjV{f`YPY}IM2u!a=;j2*B8q0CF7I@Nc_kwvI5vdf0h(ij!#lKSVpi?0gA@3oTrBz2Km!3;Aui#6@N-gzZEJTVx7bG0~`w z8dF=bf7_&q+k?QWuTD?EcA}nr6+il{Lw-mcwaJPsIE>WS`LP;7s@_^D`vYPBg3@7a zB<*^aKsJm|`pa;_SKD8%0cS%;XnW*i$R46HL_+pTmA+uUVnnN03EpCpksBIkBtB9# zfRup75F*D0th&aSn=gR=jQWUFt7uPfC)!RUe<4)rn8e@0ZSt?dJtEpVWh|B`95M?q zZrkjVJRk%k+EB${9YAf&6uuWASxW~=$lKWn-SN8KL}{9Zz( zT{z>u9s_@y)B6n&Si5LSpJg^NquCJ*AF0IFrdQPKJV8!FR;W;=XWdG~ ze}g@~G;-#Bkc4_OYJG4`-d2{3d5$u)D8FmNIrGQQv;x|bt$4s~(R85~P}+b*cZG+h zr>Y8moNi41SPqmyYCn)_h|x`S)CUI; zK}+0r*&T^wiVX~zuTzq>gvmUzk;h5Re~_?5aILb^FxT4K4>;u*xxK!PP%cYIU+m)oWY!Mz_=qcD*0r5f4_Uu zQpkPo(Von;xJ>H*&SrIYZ?ca(Z=<7yyH$1B0E5<6qC19AL@gUvEB50@R(jjg9npas z&Lb9`buS$2V5%<*BMmK#o8V)49lDJ*KKxt=4#joM6vUO8AqXQSwGoD+xr-@brkV)|OWDluB(bii6>e@xVvh73 zH(Ciq;TkK&?L-=k!hW#SOFjb^oLj8+^2fq3-HZaFlu71Z*;3W2?1S%Oa9XIzeq_Qc zT*$+(fU*qX)d3&W5mn_xAk4aN>pA`%2?!(*#{Ycpj)%;lF;4EkR_l0Gf0BOn!NZD7 z_ali$_R0Vjjc79IZz7@d8S68rC(o!b5?YV0$K6KPV115G{zk8=F*wZ@T;3&y_G!`gKB(Cm8-} z_s%MY1k&4S*UlyTXWjq{B5kI4aK-U{A3NwzZGD(kM-+g~I6PIJe}EKSKX0ZIcS}II z_=q>>-rPD3h~5g>+p`x$F!Y(Kczr%dBf6A<+W16@HOEx^gmALqAFEH55LH@iXTW^A zETFyp{PH4gPsYZFWx=FCWYM4PcUD9}R-x#PuDN$c*~VdAuGOKuI0B5_P$i77(2Eh(@e8k9r?py2ZYbMFIGo8Sp~{MZ<-8vkN1SRB-$vMWp0 zMTLD|3pCFHf57HJs?OzGF%Ro()YMOSheO+u74Wj<6?+VADIej{GDOR2X?eVf323jf zJk@Wx`)i6VeC|$lhU1!WQhy*ivT_YI;}i6E{(c-$ZpuGZ`3~rejf7$33tWcS6xmr* zbA@aaI{%lpPET=(BPC{^`4o?a65KlBfC{BKXi#c9e~W1=4M$z$QVQILynOUrh?sR} zsR2M+{V3NPX>&_O$n|akXF36~D(B1O!DXMOL`>|CW99^%a8pa^H~6HwE7FJ=Cutw5 znny+lF`0D)?Gbj>Bp<-WZb;ALYTrGuFi#o6^)FQ@CKZH-hYgc>JLUB@t8OYU-rS-=@Y&^CXNHSO@tc1@2VKYPO6H$OUTt_|_@DY&<%PgvfXSuy7>J=K zfcbnG2`E*K=b`b2-`gH11{JqPPG*e(ZjgSnW$pcBWrJtos!N+KAePB^bFJ>%dtdk z=oX?RSHQSQhe3b^fmphvFP06fbBSM1+eDdUxozN-0#U;`VvAZt{)uJrZ>c{+GcbzZ zg8A=*=`RCQ6Eg^EszdQREUMSxF$i6e;pTxRI+&egjhndw<_9-o7*wAZQ^Zr@W5*TV zf0qFt^S4Y{tyul=d-jvj2wB#uiLivWybVD??i>aYJ7#y^uVDjK1?tIjLr;(78+ws| zAgZeAW~;JV&FXO*y2u*q+apJvUoyK@-R_ERN97vICV^T)tpQ&+h7ZlPP&as?g34rS zJ}yRGfb1u=KVj@_#wcO%GW^V2HlP+9e_p-tTN|HB0}1a>`iYwk zr@MJ_MID82%|e2K04*|5Pnx9%{)s}KsRcIm}dcp5F-wJ*;n1MaLI z^;O(*;=lTXo7s~uy4MRsZ!y8k2|;cI-EUXmslleX;d$HFE$f>rZH@z9)mM#H;VZt&EY7t>CkAU8}~qgBTq@GIhB{G zW(BHZg^Zv^v2G{st;Nh-NOina3)4`1nNsw_KJXb61*F3~nI{P})`Qskf$1#M1nRqy zn`Lk7Q|tD#1fyVGVk1NoCQi>Fe_|1l2c;#TBFQ$HeN?JkVT8&jR%Wi-D}pkQsxTPc z-d*_;awBL}IR`ybG5i9tDC`t{|MxS@0Z0}VsFn_UzjlZTjA$rKwTLPATf2i<{#SnY z1As;RBT5$r`?=kbPCObk3lqwA4bI)-S{mv!`Ely_#f8?FL8jU3a%8#vf0zo#`^h@# z!oj1o+B_+J5*@ProdL-_s3<6f7g8Y9tlYcu_?hks zkode>$1E=YK|-dO)}k&oO%&%6-@h#32}-1k#64sBHzh+iv5J3dfJg&R;{I9-Nh&!) zdz-&1R)3s(xgQ3;6sR)7f3!f&l>XELM((OIK*OR>J1{2)`6D4IMFMIil7L^;dIO6( zd3Yiu{MqCRPfJgclY$F2T&7&GC&42XD3j55`0}P1|X!Pe@ZgMd0McHpD2H- zzig^=Z|?2WOINrB_*t_UA?tQy|Ab7`9FX~&IJ|COM!#pwO!fq{#78+^vJ7m&O;3p2 z=^TjOC~g_F?@Yno$sYm^5k<+rB_U8of&rkyf)YU44%5{ijvsnKC5ws?Vtsm#UOvON zZ@S#p1pz%;K(Ffnf8}l>z*_v^6zn7*w6)#jq;N7X(Chmb)Csm^&B~*8*Xh+V%W;j) zBt=E9oHs1Axf45)dn_rF3N-R`%A!nL% zIBYlTc#+|*^ta2F2Arf?)uaM}5@veXGYrp-+;?d%R=X6lsrNX3fn$*OB4uLe;Fgsg ziD&+93mEGWO%sBG7=1xLn}jdxRfU~UK-e+WK)02{)H9+YZfy&)uc@*I0ZA~Vw&RQl z@dtEpWS3lnew9YiWY>hyO-O;f_BJUqj39=$1`ox*j zTqbCQ!;+pjyV6ksU*9cBAB1Ncy(7kL%3R<`Ez9W@vbYIPBzsqtZjqm9l8ubn!%QVB z_qrOie&0b+C?w@;hG$%*C~lxBTDnD~q+*A(lhN$@hA1MG7{Z;8`zf2gwUAjS2bf2b zhC92Me>q5it{%c_Y__z*JtJ9)uHqKNnHbPTjFM)Z-AbybQn9d<*n><5sWWIQv8xER z<~jsG0BIMpr?HZ`)BD@dLdie_q*XX%n_vU8={>9wU3XJ)mMac`j3R5~CgpA5R@|(} z_1O0^kq}FS4VM=na4!v_r(rz|ZW~EB1NojYe;gn~kmcTEN+2!By0ji4o4r9DS3xQm zh=FuhOC(rwnh*vxjH`PfB?z#fi{X?1_1ZN7wrM8!p&UU_Y*A9+vlz+Zx!pu6rDF}r zT5*%7GXumavsr|!DvciR%h;(0Mn2b)=U~KVI#he7Yh}LMM9#nL<({I&1paj?8LvUF ze+MN!3`UxT3EuQvP}VU5X3=p)k*iWx^Af&W%g_%6AAwms*5j3|c4|Rn$nd7H;#mgn z2aUOvA<%@c@2`JG{@{O@f0hK&>8v;Z?9eHSm~hgg_p zhU{@7w_G2$1Diiv`$y&cARO5N?UG}ufA(eV(GNZ#`f|T-J)+7dH%!=noQV6N%^DIl zKSUeo=BrV3m^!0}8)U{VW%loHktLVYhf*D!nJ^dW{h-8v~i>Q|Mp^h9bAe+ht9w1&Mhf0nJXE2O>eqKIT5e{}c9 z*5pIiF)g77=0FF3uFB1#i=%tvSuQu6uZI4&F!fip{ov4G-&tO;zakcSdW^`Wo_Ab< zlv+=N-k<5;oW76V?@Qd^{$Mqy$095+O#CoBxbxOJ5sQ^G=zZug(NJfF21_|b4gz>u zDPZp@U9H|C6fdOO4*4Rd=N$ZqezBxgJAg9(T$D-!EVs^*$Igy$yIU>|*eT-7mD`7j zy#u?453uU7|ByM6z)6`mv!yj6SlRSyrsIbBP)eF8x`X^EGyiNw6YQyCi1ddRB@ku^;0w^EgXp~R+%&Z2VuwDU&-G6*y{YsPo delta 5916 zcmV+%7vt#QPUB7rD77#Bq=Op)0G+W4Mi2pylTi>pe@^~`@8BpuLT9)%vEdfSPj^C; z>HB1M3q*J%KnyLAWel;O@}zlN!#@0CG^Y#n`b&ELX9Ivk%L<$j$>dr=oOQ(;j|gwVSh z2X0~~Qr}!bcfP>GE|~!4Mr(=lC$P{<%U@c99Ek63#P)u}5!jvt0%K=c{;%Y`pwaL@ ze|})>6?rK$Rletj#H{F_;(t!%a%nC1HONa$@CO828cCJL^6Q>fepAdG;hjfeMKab} zppuLUBISGY@9Hkazs#a6&VR8u|B_3k)-CQD&QFW2^J#~HjM{5+!w4Z9;`?^5QB^a% zaQ#WOp=}q#_q&SXL8=Y}rH~L*%y$zUfBUW>#q8^)Dgj{^&7#eY2P80Z3cMG|a}0ad zjFwJFA__cdMZs^TL%!io3#?REhdz438rnd`cZzH#Lz6-%|KhF5Hf9UbU z1zW+}P4%oU1vx3Fq%^5S;BGK2^Xt}|d>66x0Ty`?bNn)J1h7?`NPm*7*5~aEFMPAp zZ71xRoHed=X)uk?Di&Z6NP+2pw>U9vc`?i;^ePCj2X`g9;IMJtvj$TLkpV*Y7o48? zzk2K&D*%U1Lgn6viTK?{@`RnIe~)H|yNA@jtg#s|U>v4d02SIrBaeQLoGUpleQp0l za#f5)+K_Q#q2|IMTld4Bo^MaRJLFu^wP2QsdjDu)VJ*5VxpSbh(1D-z8XkU60|TZS zCi=J>0h*AX7foWsk+RxdQ}w_%J?ZlwQz=VPTebAKxmRI>8Yp<~(U1xFf9aCMfQ#n^ zg*L30f3ulF7(W`Q@e(GSxSnqAKoN~%`&VARrGc?Bam~F@f}Q$TH&D%FulH+4g%$nm zhNs|uX?>q7XlAy2yMR;hjT3(%yo!h4INAMNRqz>6(dN8~MkS45NMSdxBW88UzK;dE3Y5kXBvaHAQ3&3(8P0)B`!_!H z9Ym%OgqkRc7WHo|e>S*{SXWVmq(x7| z_N^&g7M!#!(sd$|5pIQigr0aN_J*WWIWEDn)uh?I*wg}oyOZW4?|B725E`t!>meSo zfr3G(dk78kVH`vU91svuPp_kM@ku0nYoBQDOMf}p63)V;emuzHmJ2E@xke8U&aa6J z1@m+?S(!cxxBXx5e@;P3ulgv?S+FCoIOD8mj!GHcz(dlmjIi!A{4d{5PRD3L<2jZ~ zpO4niT$x*Kn|XxRfd;+ag-g8asV=oC_&sKz()ErQLpq>KmNTJG^l@*3pS+MdhGmOL ziH<1bas$bCWNMYe&`jl~$(PV>D8NHSr#Kc-c6~%?pPh~)e*n5d(Hp%vR~(kA$9r>E z*b>5{3nhy#CJcBQFeC&eNlL3ZEP55;8_+$xoaJtZbphN>`u%2T*$apJGTPbdQaw~2 zdcqeUD+#!u)u^}YlRdYbCh$1c^O0uVUc>7R4q<8XiNy`;S%igA)!@EKWdurrHpf1B<}Q~*g&VP^ZAh)vdm+_f!Y>1Cv#adFg z%;^=DApzy@eneNaOb)YdSAFrLL<_4ocml<>pD$onT~LUT#2acx8h3)DVdA$Po4L5? z)UQOu#c z#7C+IkP^@sLgd(hRo57E^99hKQ6G_N73~S`e?;49B!o&GllWV>P5w2wM?_nvjKva# zLuLWSZJS+^2ZUfm8>$$r1E`Ie!uJ9sYv~{fc{>}SJ6_kDyn^g~U^2%GAgQ3Q1=x*= z-%F^J-1Jb;$+Eh<3uoNdW8iOddcOezYZp!Fv&<%DG&_RfBbC_N^on|&C&+2Y3KgpK zf2>=Hc(BKpM$Wtsl2C6(tq-or+scwL&ryaJ<#%m3Xa4w^RzQ2Q6%V*AnlAJLN*j>q zuJF+GR8_%`(~Zd=%YiaT?FUk={2>Ziv{ne60O5DMJGY?aP!*Tqpf*+SfK=}pF}jJ4 z`rrT}Xo=e{yCacIv4J7;bxN|9Fqua-fATn~84{KVu2r^~mbaS#I};%DDIDH=MkDTn zS)>-Q7vT3Cze*8PswtXtzO(xnChR|niMjm^-7?BlheXWXt8`I!PX|J2VDF?HE~4m* zmZzeW1YxA)b|m@iI|YNs z#jGeYKe^crt!vQfZ-QJ2UcQGZ*R5Jl#DVrIdB-Zt`!p$v7 z%#pt1Mk|3RTw|rUok)XG*bkO^$!Fk#bBonp{#Y2Mn^8cNGRfR4TdG=>eehijP75{J zk4$)l3wihzP?jOQI^cskqNxHHe`+1CO46@B zcvz9?ek9SzUKzlm5ltrjO(b+aV}0iIa{ee#1*})FOB+V z-sM`+A4or`m2 zFb-3F0Y2KqZW{ZJK_ufVfB!yzAj@g&yf{L>ahuz(6ucaVWn1R>wBW|ZKUZPOH6iCE zajf{H8>qAAM0-^o7pwDe%wj)g`JG)J&#vd?2gSh#q6M*cW0Q*TO?RLBxe{hdzfK79 z1jAqL-dV+vKzcjv+PP%^%o|`qq|Fo$t~lQBV+Z}Itq-&6hyt(~e}|{a6Of|o=gm~& zZV4zCAMxhgn_H&=(OV&Vd-j3|hCWjjug?c*M3*v98=pwA=9r3~5KcDyWA&*LqDqVH z446-s1+=%HUtXl`$=LX?ESMCCEc&zk&Wb3=DipoZHTTXa+c>PtwK|j+M}V;#s)X@% zoR>%nrO*dB@$psLe<|U7!n@qpr%}z$gNhM{&ME^#6|`d~90wxi5NerPzQNb=;hL8+ zKE33>*1lNspztLSWJ$I==hGJLOmR<5CDe1iVY-;X28P5GxP-vOPmkuZ#Wfy)q^ zB0Fnpu8@sF=l{~y=_xL8q{QqqpW@L_f?FpXP@yyje+^1)XEAN1;izj|N`c#umyezc z5wq?rH2`R6{C%$%STZfYt02A@=SMH(^V zB<(|0^T_BRCbN#9J;JV<WNM4D+ zIkzJVN}a^>A(`$80|&20KNE@z8m170@;1*Cf2n1N=)zw|D(AbmI_ilIy2CDto`D)< zIhKeG-9nV)3K%!(FbJ?95KEWz#j;^_F7fMWn<$ekw+)<9AZj>AY*CBIKd~(SE%j$; z21fB)F#mlp{bgWkVg^A?btry^MfEy72B9l5+&s`k2eXr`aWhxI{NP3mgX;5Qig+q~ zf9$xz`!e8T{+21L6{{b9&wer*Anv$L#L=HEf`&Ks|YG=;@Js zLoX5#L{&B2Y*kjPSv_t;7g=L{d*rC|OJ=vK+g;J^s9ZzYBv4DJHQ)=!@S(XD>IN@V zP?=24$Hk}%ko~0gCybrV7$po|hM#%Me+JZo!>bp5YvWUCAYnhuKpCVEN9zrZ9Uetf zKXKFHbT?0~sH5<$Sx8gm4hTke_@baY{Y|t>fOMEA^CW@BdJtPbFr8(Z zKz%oIv+QksYTbU8U=*xNY=mgSf5hn-L@XlmptJ;3B-tjjk4lv*j8OT+%FK0pMNsBZ z6$Yc*yDMKpZUn6==b%R_hF>5Sg`J}B|9*x!0Lh{P)zV?_*A6j(5eX7Ga#7<6$Pd6LJEYMHC#Y^p!w;Hw>%EsmoHyE`XDg`OarA7 zKhs?S5}#M=n8oEkNXQh^TGXYciQ-)1`UZ}V5h>W_0T_rt)Ke*#rTm=>s+(w|zu$X!(iXjt@V2j=7;e;!60g0HK^e*lDZR7r+7PYagu z6Xj3!mrZr<&Aokk=?b?1KWi2vWZiD;pOA@~12TUThu7`P==Y47$)13g_$bFqmVqs} z=?RfLodeMu#Vv#OohjHm`9r`VqA2;dBm~MxFaT6oPy#61VY>Rm@k1}DWKl6ftWWRJ z%V*g3O_$rcAfQJJf9Q1`pxjLaSc@N=f}I3}wziv`6i(&^dVT+bI>DB#S$Wj%I=xzE zIj+%}1pOk8iH6QWYLf_|Cx>Xfozi36Vz0yr3RB8fwUO&{><>+LZ;<>N2!btP=#B%lt!+W}HC5IiAPI)l zcAOC*{(ugSf9#TL@C1Sg;_1QouI|N{wbhwO6@ht~)|sZ4tq};ZJ34kq$@fCgYaylcf`0&nF}1LWjVb<7B}IEWbdky9ED&jL;&>u zXh*!r_rp%P0Y80}71m5`6}HQC8|e;Z&LAx!=B*u^pT(UvXL=+ zn5kstURQ(G?>i_8g`|AV@QkYz#SJt?OSg!WRP2y;GMZiA5JiL%L%8#CKV{Rm7BcJP z0P|?le{g3PGY1LK)k9c~&6ZZUXCzC}RosF&69c-4QPQllTS@g)Di)R!dywfMbp}l( zb`_!4T!#P%Anii-G*&WqdVd>QC>dygvuxH}a>W6VQDlwWq`VE> ziklU=9{XM<5@Lz4;qn3m?xjKWG^~fgZ6gV1e<0s8h67{>vfO)238V#Cm)0X>vp1;Y zDo6zbF_7+Ri3Ced6T+Z|adi)*1OXOwF?{mBUb`m1HqGQdlp_d=ElLV}79&|ax0^_% zbgUs+D{k_1W`HNRr(~pfP0%nL{ z6DJMl?vgC3a@vd4)*9UntbRwrX}>i9O&TBRk>MoaddAy%jJgi)zJSIrv9q79~>I&JIf39SHvPuj}f`l z^NuT!QtN5Z`!oHU)A!N)eTf^~AFSr|ScCBG8tSajU@51_ zK>$xH1?)YgtJPbC;)PV(Az$S5f1HCKF_yGy2TN=Xw%caR@t=AW&o{DXh2d%L0j y>m#qeGjkv2pAM(ZFY+K$ZMYnRz!9o1KFKFf0ObQ5jq)j