diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index f3f92bdca..0a8333ce2 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -3,11 +3,11 @@ #include "silo/config/database_config.h" #include "silo/config/preprocessing_config.h" #include "silo/preprocessing/preprocessing_database.h" +#include "silo/storage/pango_lineage_alias.h" #include "silo/storage/reference_genomes.h" namespace silo { class Database; -class PangoLineageAliasLookup; namespace preprocessing { @@ -18,12 +18,14 @@ class Preprocessor { config::DatabaseConfig database_config; PreprocessingDatabase preprocessing_db; ReferenceGenomes reference_genomes_; + PangoLineageAliasLookup alias_lookup_; public: Preprocessor( const config::PreprocessingConfig preprocessing_config, const config::DatabaseConfig database_config, - const ReferenceGenomes& reference_genomes + const ReferenceGenomes& reference_genomes, + const PangoLineageAliasLookup& alias_lookup ); Database preprocess(); @@ -65,7 +67,6 @@ class Preprocessor { Database buildDatabase( const preprocessing::Partitions& partition_descriptor, const std::string& order_by_clause, - const silo::PangoLineageAliasLookup& alias_key, const std::filesystem::path& intermediate_results_directory ); diff --git a/include/silo/storage/pango_lineage_alias.h b/include/silo/storage/pango_lineage_alias.h index dc987b4c3..412b4b02c 100644 --- a/include/silo/storage/pango_lineage_alias.h +++ b/include/silo/storage/pango_lineage_alias.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ class PangoLineageAliasLookup { ) const; static silo::PangoLineageAliasLookup readFromFile( - const std::filesystem::path& pango_lineage_alias_file + const std::optional& pango_lineage_alias_file ); }; diff --git a/include/silo/test/query_fixture.test.h b/include/silo/test/query_fixture.test.h index e03f1ba10..2bd781c85 100644 --- a/include/silo/test/query_fixture.test.h +++ b/include/silo/test/query_fixture.test.h @@ -16,6 +16,7 @@ #include "silo/preprocessing/preprocessor.h" #include "silo/preprocessing/sql_function.h" #include "silo/query_engine/query_engine.h" +#include "silo/storage/pango_lineage_alias.h" #include "silo/storage/reference_genomes.h" namespace silo::test { @@ -67,6 +68,7 @@ struct QueryTestData { const std::vector ndjson_input_data; const silo::config::DatabaseConfig database_config; const silo::ReferenceGenomes reference_genomes; + const silo::PangoLineageAliasLookup alias_lookup; }; struct QueryTestScenario { @@ -117,7 +119,10 @@ class QueryTestFixture : public ::testing::TestWithParam { file.close(); silo::preprocessing::Preprocessor preprocessor( - config_with_input_dir, test_data.database_config, test_data.reference_genomes + config_with_input_dir, + test_data.database_config, + test_data.reference_genomes, + test_data.alias_lookup ); DataContainer::database = std::make_unique(preprocessor.preprocess()); diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 79e1271eb..d82bd51e2 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -26,7 +26,12 @@ silo::Database buildTestDatabase() { const auto reference_genomes = silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename()); - silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes); + const auto alias_lookup = + silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename()); + + silo::preprocessing::Preprocessor preprocessor( + config, database_config, reference_genomes, alias_lookup + ); return preprocessor.preprocess(); } @@ -52,7 +57,12 @@ TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) { const auto reference_genomes = silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename()); - silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes); + const auto alias_lookup = + silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename()); + + silo::preprocessing::Preprocessor preprocessor( + config, database_config, reference_genomes, alias_lookup + ); auto database = preprocessor.preprocess(); const auto simple_database_info = database.getDatabaseInfo(); diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index fe19b02b8..dbc3a5844 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -28,12 +28,14 @@ constexpr std::string_view FASTA_EXTENSION = ".fasta"; Preprocessor::Preprocessor( config::PreprocessingConfig preprocessing_config_, config::DatabaseConfig database_config_, - const ReferenceGenomes& reference_genomes + const ReferenceGenomes& reference_genomes, + const PangoLineageAliasLookup& alias_lookup ) : preprocessing_config(std::move(preprocessing_config_)), database_config(std::move(database_config_)), preprocessing_db(preprocessing_config.getPreprocessingDatabaseLocation(), reference_genomes), - reference_genomes_(reference_genomes) {} + reference_genomes_(reference_genomes), + alias_lookup_(alias_lookup) {} Database Preprocessor::preprocess() { SPDLOG_INFO( @@ -50,14 +52,6 @@ Database Preprocessor::preprocess() { throw silo::preprocessing::PreprocessingException(error); } - SPDLOG_INFO("preprocessing - building alias key"); - const auto pango_lineage_definition_filename = - preprocessing_config.getPangoLineageDefinitionFilename(); - PangoLineageAliasLookup alias_key; - if (pango_lineage_definition_filename.has_value()) { - alias_key = PangoLineageAliasLookup::readFromFile(pango_lineage_definition_filename.value()); - } - const auto& ndjson_input_filename = preprocessing_config.getNdjsonInputFilename(); if (ndjson_input_filename.has_value()) { SPDLOG_INFO("preprocessing - ndjson pipeline chosen"); @@ -92,10 +86,7 @@ Database Preprocessor::preprocess() { SPDLOG_INFO("preprocessing - building database"); preprocessing_db.refreshConnection(); return buildDatabase( - partition_descriptor, - order_by_clause, - alias_key, - preprocessing_config.getIntermediateResultsDirectory() + partition_descriptor, order_by_clause, preprocessing_config.getIntermediateResultsDirectory() ); } @@ -462,12 +453,11 @@ void Preprocessor::createPartitionedTableForSequence( Database Preprocessor::buildDatabase( const preprocessing::Partitions& partition_descriptor, const std::string& order_by_clause, - const silo::PangoLineageAliasLookup& alias_key, const std::filesystem::path& intermediate_results_directory ) { Database database; database.database_config = database_config; - database.alias_key = alias_key; + database.alias_key = alias_lookup_; database.intermediate_results_directory = intermediate_results_directory; const DataVersion& data_version = DataVersion::mineDataVersion(); SPDLOG_INFO("preprocessing - mining data data_version: {}", data_version.toString()); diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 7f90ca374..00c81b994 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -118,7 +118,11 @@ TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) { const auto reference_genomes = silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename()); - silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes); + const auto alias_lookup = + silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename()); + silo::preprocessing::Preprocessor preprocessor( + config, database_config, reference_genomes, alias_lookup + ); auto database = preprocessor.preprocess(); const auto database_info = database.getDatabaseInfo(); diff --git a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp index d6b4c45ca..94d9c6bb3 100644 --- a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp +++ b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp @@ -73,8 +73,8 @@ void from_json(const nlohmann::json& json, std::unique_ptr& json.contains("value"), "The field 'value' is required in a PangoLineage expression" ) CHECK_SILO_QUERY( - json["value"].is_string(), - "The field 'value' in a PangoLineage expression needs to be a string" + json["value"].is_string() || json["value"].is_null(), + "The field 'value' in a PangoLineage expression needs to be a string or null" ) CHECK_SILO_QUERY( json.contains("includeSublineages"), @@ -85,7 +85,7 @@ void from_json(const nlohmann::json& json, std::unique_ptr& "The field 'includeSublineages' in a PangoLineage expression needs to be a boolean" ) const std::string& column = json["column"]; - const std::string& lineage = json["value"]; + const std::string& lineage = json["value"].is_null() ? "" : json["value"].get(); const bool include_sublineages = json["includeSublineages"]; filter = std::make_unique(column, lineage, include_sublineages); } diff --git a/src/silo/storage/pango_lineage_alias.cpp b/src/silo/storage/pango_lineage_alias.cpp index b1dd40958..eda306a8f 100644 --- a/src/silo/storage/pango_lineage_alias.cpp +++ b/src/silo/storage/pango_lineage_alias.cpp @@ -100,25 +100,32 @@ std::unordered_map> readFromJson( } // namespace silo::PangoLineageAliasLookup silo::PangoLineageAliasLookup::readFromFile( - const std::filesystem::path& pango_lineage_alias_file + const std::optional& pango_lineage_alias_file ) { - if (!std::filesystem::exists(pango_lineage_alias_file)) { + if (!pango_lineage_alias_file.has_value()) { + SPDLOG_INFO("No pango lineage alias file provided. Using empty alias lookup."); + return {}; + } + + const auto& pango_lineage_alias_path = pango_lineage_alias_file.value(); + + if (!std::filesystem::exists(pango_lineage_alias_path)) { throw std::filesystem::filesystem_error( - "Alias key file " + pango_lineage_alias_file.string() + " does not exist", + "Alias key file " + pango_lineage_alias_path.string() + " does not exist", std::error_code() ); } - if (pango_lineage_alias_file.extension() != ".json") { + if (pango_lineage_alias_path.extension() != ".json") { throw std::filesystem::filesystem_error( - "Alias key file " + pango_lineage_alias_file.string() + " is not a json file", + "Alias key file " + pango_lineage_alias_path.string() + " is not a json file", std::error_code() ); } - SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_file.string()); + SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_path.string()); - return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_file)); + return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_path)); } } // namespace silo diff --git a/src/silo/test/pango_lineage_filter.test.cpp b/src/silo/test/pango_lineage_filter.test.cpp new file mode 100644 index 000000000..f2bee332a --- /dev/null +++ b/src/silo/test/pango_lineage_filter.test.cpp @@ -0,0 +1,116 @@ +#include + +#include + +#include "silo/test/query_fixture.test.h" + +using silo::PangoLineageAliasLookup; +using silo::ReferenceGenomes; +using silo::config::DatabaseConfig; +using silo::config::ValueType; +using silo::test::QueryTestData; +using silo::test::QueryTestScenario; + +static const std::string SOME_BASE_PANGO_LINEAGE = "P.1"; +static const std::string SOME_SUBLINEAGE = "A.1"; + +nlohmann::json createDataWithPangoLineageValue(const std::string& primaryKey, std::string value) { + return { + {"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", value}}}, + {"alignedNucleotideSequences", {{"segment1", nullptr}}}, + {"unalignedNucleotideSequences", {{"segment1", nullptr}}}, + {"alignedAminoAcidSequences", {{"gene1", nullptr}}} + }; +} + +nlohmann::json createDataWithPangoLineageNullValue(const std::string& primaryKey) { + return { + {"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", nullptr}}}, + {"alignedNucleotideSequences", {{"segment1", nullptr}}}, + {"unalignedNucleotideSequences", {{"segment1", nullptr}}}, + {"alignedAminoAcidSequences", {{"gene1", nullptr}}} + }; +} +const std::vector DATA = { + createDataWithPangoLineageValue("id_0", SOME_BASE_PANGO_LINEAGE), + createDataWithPangoLineageValue("id_1", SOME_BASE_PANGO_LINEAGE), + createDataWithPangoLineageValue("id_2", SOME_SUBLINEAGE), + createDataWithPangoLineageNullValue("id_3") +}; + +const auto DATABASE_CONFIG = DatabaseConfig{ + .default_nucleotide_sequence = "segment1", + .schema = + {.instance_name = "dummy name", + .metadata = + {{.name = "primaryKey", .type = ValueType::STRING}, + {.name = "pango_lineage", .type = ValueType::PANGOLINEAGE, .generate_index = true}}, + .primary_key = "primaryKey"} +}; + +const auto REFERENCE_GENOMES = ReferenceGenomes{ + {{"segment1", "A"}}, + {{"gene1", "*"}}, +}; + +const auto ALIAS_LOOKUP = PangoLineageAliasLookup{{{"A", {"P.1"}}}}; + +const QueryTestData TEST_DATA{ + .ndjson_input_data = {DATA}, + .database_config = DATABASE_CONFIG, + .reference_genomes = REFERENCE_GENOMES, + .alias_lookup = ALIAS_LOOKUP +}; + +nlohmann::json createPangoLineageQuery(const nlohmann::json value, bool include_sublineages) { + return { + {"action", {{"type", "Details"}}}, + {"filterExpression", + {{"type", "PangoLineage"}, + {"column", "pango_lineage"}, + {"value", value}, + {"includeSublineages", include_sublineages}}} + }; +} + +const QueryTestScenario PANGO_LINEAGE_FILTER_SCENARIO = { + .name = "pangoLineageFilter", + .query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, false), + .expected_query_result = nlohmann::json( + {{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}, + {{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}} + ) +}; + +const QueryTestScenario PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO = { + .name = "pangoLineageFilterIncludingSublineages", + .query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, true), + .expected_query_result = nlohmann::json( + {{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}, + {{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}, + {{"primaryKey", "id_2"}, {"pango_lineage", "P.1.1"}}} + ) +}; + +const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_SCENARIO = { + .name = "pangoLineageFilterNull", + .query = createPangoLineageQuery(nullptr, false), + .expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}}) +}; + +const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO = { + .name = "pangoLineageFilterNullIncludingSublineages", + .query = createPangoLineageQuery(nullptr, true), + .expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}}) +}; + +QUERY_TEST( + PangoLineageFilterTest, + TEST_DATA, + ::testing::Values( + PANGO_LINEAGE_FILTER_SCENARIO, + PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO, + PANGO_LINEAGE_FILTER_NULL_SCENARIO, + PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO + ) +); diff --git a/src/silo_api/api.cpp b/src/silo_api/api.cpp index 102944aef..f845356fc 100644 --- a/src/silo_api/api.cpp +++ b/src/silo_api/api.cpp @@ -254,8 +254,13 @@ class SiloServer : public Poco::Util::ServerApplication { const auto reference_genomes = silo::ReferenceGenomes::readFromFile(preprocessing_config.getReferenceGenomeFilename()); + SPDLOG_INFO("preprocessing - reading pango lineage alias"); + const auto alias_lookup = silo::PangoLineageAliasLookup::readFromFile( + preprocessing_config.getPangoLineageDefinitionFilename() + ); + auto preprocessor = silo::preprocessing::Preprocessor( - preprocessing_config, database_config, reference_genomes + preprocessing_config, database_config, reference_genomes, alias_lookup ); return preprocessor.preprocess();