Skip to content

Commit

Permalink
fix: pango lineage filter with null values
Browse files Browse the repository at this point in the history
  • Loading branch information
JonasKellerer committed May 2, 2024
1 parent 47b436e commit 0b604bb
Show file tree
Hide file tree
Showing 10 changed files with 176 additions and 37 deletions.
11 changes: 6 additions & 5 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"

namespace silo {
class Database;
class PangoLineageAliasLookup;

namespace preprocessing {

Expand All @@ -18,12 +18,14 @@ class Preprocessor {
config::DatabaseConfig database_config;
PreprocessingDatabase preprocessing_db;
ReferenceGenomes reference_genomes_;
PangoLineageAliasLookup alias_lookup_;

public:
Preprocessor(
const config::PreprocessingConfig preprocessing_config,
const config::DatabaseConfig database_config,
const ReferenceGenomes& reference_genomes
config::PreprocessingConfig preprocessing_config,
config::DatabaseConfig database_config,
const ReferenceGenomes& reference_genomes,
PangoLineageAliasLookup alias_lookup
);

Database preprocess();
Expand Down Expand Up @@ -65,7 +67,6 @@ class Preprocessor {
Database buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const silo::PangoLineageAliasLookup& alias_key,
const std::filesystem::path& intermediate_results_directory
);

Expand Down
3 changes: 2 additions & 1 deletion include/silo/storage/pango_lineage_alias.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <cstdint>
#include <filesystem>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>
Expand Down Expand Up @@ -49,7 +50,7 @@ class PangoLineageAliasLookup {
) const;

static silo::PangoLineageAliasLookup readFromFile(
const std::filesystem::path& pango_lineage_alias_file
const std::optional<std::filesystem::path>& pango_lineage_alias_file
);
};

Expand Down
7 changes: 6 additions & 1 deletion include/silo/test/query_fixture.test.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "silo/preprocessing/preprocessor.h"
#include "silo/preprocessing/sql_function.h"
#include "silo/query_engine/query_engine.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"

namespace silo::test {
Expand Down Expand Up @@ -67,6 +68,7 @@ struct QueryTestData {
const std::vector<nlohmann::json> ndjson_input_data;
const silo::config::DatabaseConfig database_config;
const silo::ReferenceGenomes reference_genomes;
const silo::PangoLineageAliasLookup alias_lookup;
};

struct QueryTestScenario {
Expand Down Expand Up @@ -117,7 +119,10 @@ class QueryTestFixture : public ::testing::TestWithParam<QueryTestScenario> {
file.close();

silo::preprocessing::Preprocessor preprocessor(
config_with_input_dir, test_data.database_config, test_data.reference_genomes
config_with_input_dir,
test_data.database_config,
test_data.reference_genomes,
test_data.alias_lookup
);
DataContainer::database = std::make_unique<Database>(preprocessor.preprocess());

Expand Down
14 changes: 12 additions & 2 deletions src/silo/database.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ silo::Database buildTestDatabase() {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());

silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
return preprocessor.preprocess();
}

Expand All @@ -52,7 +57,12 @@ TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());

silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
auto database = preprocessor.preprocess();

const auto simple_database_info = database.getDatabaseInfo();
Expand Down
22 changes: 6 additions & 16 deletions src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ constexpr std::string_view FASTA_EXTENSION = ".fasta";
Preprocessor::Preprocessor(
config::PreprocessingConfig preprocessing_config_,
config::DatabaseConfig database_config_,
const ReferenceGenomes& reference_genomes
const ReferenceGenomes& reference_genomes,
PangoLineageAliasLookup alias_lookup
)
: preprocessing_config(std::move(preprocessing_config_)),
database_config(std::move(database_config_)),
preprocessing_db(preprocessing_config.getPreprocessingDatabaseLocation(), reference_genomes),
reference_genomes_(reference_genomes) {}
reference_genomes_(reference_genomes),
alias_lookup_(std::move(alias_lookup)) {}

Database Preprocessor::preprocess() {
SPDLOG_INFO(
Expand All @@ -50,14 +52,6 @@ Database Preprocessor::preprocess() {
throw silo::preprocessing::PreprocessingException(error);
}

SPDLOG_INFO("preprocessing - building alias key");
const auto pango_lineage_definition_filename =
preprocessing_config.getPangoLineageDefinitionFilename();
PangoLineageAliasLookup alias_key;
if (pango_lineage_definition_filename.has_value()) {
alias_key = PangoLineageAliasLookup::readFromFile(pango_lineage_definition_filename.value());
}

const auto& ndjson_input_filename = preprocessing_config.getNdjsonInputFilename();
if (ndjson_input_filename.has_value()) {
SPDLOG_INFO("preprocessing - ndjson pipeline chosen");
Expand Down Expand Up @@ -92,10 +86,7 @@ Database Preprocessor::preprocess() {
SPDLOG_INFO("preprocessing - building database");
preprocessing_db.refreshConnection();
return buildDatabase(
partition_descriptor,
order_by_clause,
alias_key,
preprocessing_config.getIntermediateResultsDirectory()
partition_descriptor, order_by_clause, preprocessing_config.getIntermediateResultsDirectory()
);
}

Expand Down Expand Up @@ -462,12 +453,11 @@ void Preprocessor::createPartitionedTableForSequence(
Database Preprocessor::buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const silo::PangoLineageAliasLookup& alias_key,
const std::filesystem::path& intermediate_results_directory
) {
Database database;
database.database_config = database_config;
database.alias_key = alias_key;
database.alias_key = alias_lookup_;
database.intermediate_results_directory = intermediate_results_directory;
const DataVersion& data_version = DataVersion::mineDataVersion();
SPDLOG_INFO("preprocessing - mining data data_version: {}", data_version.toString());
Expand Down
6 changes: 5 additions & 1 deletion src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,11 @@ TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());
silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
auto database = preprocessor.preprocess();

const auto database_info = database.getDatabaseInfo();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ void from_json(const nlohmann::json& json, std::unique_ptr<PangoLineageFilter>&
json.contains("value"), "The field 'value' is required in a PangoLineage expression"
)
CHECK_SILO_QUERY(
json["value"].is_string(),
"The field 'value' in a PangoLineage expression needs to be a string"
json["value"].is_string() || json["value"].is_null(),
"The field 'value' in a PangoLineage expression needs to be a string or null"
)
CHECK_SILO_QUERY(
json.contains("includeSublineages"),
Expand All @@ -85,7 +85,7 @@ void from_json(const nlohmann::json& json, std::unique_ptr<PangoLineageFilter>&
"The field 'includeSublineages' in a PangoLineage expression needs to be a boolean"
)
const std::string& column = json["column"];
const std::string& lineage = json["value"];
const std::string& lineage = json["value"].is_null() ? "" : json["value"].get<std::string>();
const bool include_sublineages = json["includeSublineages"];
filter = std::make_unique<PangoLineageFilter>(column, lineage, include_sublineages);
}
Expand Down
21 changes: 14 additions & 7 deletions src/silo/storage/pango_lineage_alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,32 @@ std::unordered_map<std::string, std::vector<std::string>> readFromJson(
} // namespace

silo::PangoLineageAliasLookup silo::PangoLineageAliasLookup::readFromFile(
const std::filesystem::path& pango_lineage_alias_file
const std::optional<std::filesystem::path>& pango_lineage_alias_file
) {
if (!std::filesystem::exists(pango_lineage_alias_file)) {
if (!pango_lineage_alias_file.has_value()) {
SPDLOG_INFO("No pango lineage alias file provided. Using empty alias lookup.");
return {};
}

const auto& pango_lineage_alias_path = pango_lineage_alias_file.value();

if (!std::filesystem::exists(pango_lineage_alias_path)) {
throw std::filesystem::filesystem_error(
"Alias key file " + pango_lineage_alias_file.string() + " does not exist",
"Alias key file " + pango_lineage_alias_path.string() + " does not exist",
std::error_code()
);
}

if (pango_lineage_alias_file.extension() != ".json") {
if (pango_lineage_alias_path.extension() != ".json") {
throw std::filesystem::filesystem_error(
"Alias key file " + pango_lineage_alias_file.string() + " is not a json file",
"Alias key file " + pango_lineage_alias_path.string() + " is not a json file",
std::error_code()
);
}

SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_file.string());
SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_path.string());

return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_file));
return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_path));
}

} // namespace silo
116 changes: 116 additions & 0 deletions src/silo/test/pango_lineage_filter.test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include <nlohmann/json.hpp>

#include <optional>

#include "silo/test/query_fixture.test.h"

using silo::PangoLineageAliasLookup;
using silo::ReferenceGenomes;
using silo::config::DatabaseConfig;
using silo::config::ValueType;
using silo::test::QueryTestData;
using silo::test::QueryTestScenario;

static const std::string SOME_BASE_PANGO_LINEAGE = "P.1";
static const std::string SOME_SUBLINEAGE = "A.1";

nlohmann::json createDataWithPangoLineageValue(const std::string& primaryKey, std::string value) {
return {
{"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", value}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}
};
}

nlohmann::json createDataWithPangoLineageNullValue(const std::string& primaryKey) {
return {
{"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", nullptr}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}
};
}
const std::vector<nlohmann::json> DATA = {
createDataWithPangoLineageValue("id_0", SOME_BASE_PANGO_LINEAGE),
createDataWithPangoLineageValue("id_1", SOME_BASE_PANGO_LINEAGE),
createDataWithPangoLineageValue("id_2", SOME_SUBLINEAGE),
createDataWithPangoLineageNullValue("id_3")
};

const auto DATABASE_CONFIG = DatabaseConfig{
.default_nucleotide_sequence = "segment1",
.schema =
{.instance_name = "dummy name",
.metadata =
{{.name = "primaryKey", .type = ValueType::STRING},
{.name = "pango_lineage", .type = ValueType::PANGOLINEAGE, .generate_index = true}},
.primary_key = "primaryKey"}
};

const auto REFERENCE_GENOMES = ReferenceGenomes{
{{"segment1", "A"}},
{{"gene1", "*"}},
};

const auto ALIAS_LOOKUP = PangoLineageAliasLookup{{{"A", {"P.1"}}}};

const QueryTestData TEST_DATA{
.ndjson_input_data = {DATA},
.database_config = DATABASE_CONFIG,
.reference_genomes = REFERENCE_GENOMES,
.alias_lookup = ALIAS_LOOKUP
};

nlohmann::json createPangoLineageQuery(const nlohmann::json value, bool include_sublineages) {
return {
{"action", {{"type", "Details"}}},
{"filterExpression",
{{"type", "PangoLineage"},
{"column", "pango_lineage"},
{"value", value},
{"includeSublineages", include_sublineages}}}
};
}

const QueryTestScenario PANGO_LINEAGE_FILTER_SCENARIO = {
.name = "pangoLineageFilter",
.query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, false),
.expected_query_result = nlohmann::json(
{{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}}
)
};

const QueryTestScenario PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO = {
.name = "pangoLineageFilterIncludingSublineages",
.query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, true),
.expected_query_result = nlohmann::json(
{{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_2"}, {"pango_lineage", "P.1.1"}}}
)
};

const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_SCENARIO = {
.name = "pangoLineageFilterNull",
.query = createPangoLineageQuery(nullptr, false),
.expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}})
};

const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO = {
.name = "pangoLineageFilterNullIncludingSublineages",
.query = createPangoLineageQuery(nullptr, true),
.expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}})
};

QUERY_TEST(
PangoLineageFilterTest,
TEST_DATA,
::testing::Values(
PANGO_LINEAGE_FILTER_SCENARIO,
PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO,
PANGO_LINEAGE_FILTER_NULL_SCENARIO,
PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO
)
);
7 changes: 6 additions & 1 deletion src/silo_api/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,13 @@ class SiloServer : public Poco::Util::ServerApplication {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(preprocessing_config.getReferenceGenomeFilename());

SPDLOG_INFO("preprocessing - reading pango lineage alias");
const auto alias_lookup = silo::PangoLineageAliasLookup::readFromFile(
preprocessing_config.getPangoLineageDefinitionFilename()
);

auto preprocessor = silo::preprocessing::Preprocessor(
preprocessing_config, database_config, reference_genomes
preprocessing_config, database_config, reference_genomes, alias_lookup
);

return preprocessor.preprocess();
Expand Down

0 comments on commit 0b604bb

Please sign in to comment.