Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: pango lineage filter with null values #403

Merged
merged 1 commit into from
May 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
@@ -3,11 +3,11 @@
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"

namespace silo {
class Database;
class PangoLineageAliasLookup;

namespace preprocessing {

@@ -18,12 +18,14 @@ class Preprocessor {
config::DatabaseConfig database_config;
PreprocessingDatabase preprocessing_db;
ReferenceGenomes reference_genomes_;
PangoLineageAliasLookup alias_lookup_;

public:
Preprocessor(
const config::PreprocessingConfig preprocessing_config,
const config::DatabaseConfig database_config,
const ReferenceGenomes& reference_genomes
config::PreprocessingConfig preprocessing_config,
config::DatabaseConfig database_config,
const ReferenceGenomes& reference_genomes,
PangoLineageAliasLookup alias_lookup
);

Database preprocess();
@@ -65,7 +67,6 @@ class Preprocessor {
Database buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const silo::PangoLineageAliasLookup& alias_key,
const std::filesystem::path& intermediate_results_directory
);

3 changes: 2 additions & 1 deletion include/silo/storage/pango_lineage_alias.h
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
#include <cstdint>
#include <filesystem>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>
@@ -49,7 +50,7 @@ class PangoLineageAliasLookup {
) const;

static silo::PangoLineageAliasLookup readFromFile(
const std::filesystem::path& pango_lineage_alias_file
const std::optional<std::filesystem::path>& pango_lineage_alias_file
);
};

7 changes: 6 additions & 1 deletion include/silo/test/query_fixture.test.h
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
#include "silo/preprocessing/preprocessor.h"
#include "silo/preprocessing/sql_function.h"
#include "silo/query_engine/query_engine.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"

namespace silo::test {
@@ -67,6 +68,7 @@ struct QueryTestData {
const std::vector<nlohmann::json> ndjson_input_data;
const silo::config::DatabaseConfig database_config;
const silo::ReferenceGenomes reference_genomes;
const silo::PangoLineageAliasLookup alias_lookup;
};

struct QueryTestScenario {
@@ -117,7 +119,10 @@ class QueryTestFixture : public ::testing::TestWithParam<QueryTestScenario> {
file.close();

silo::preprocessing::Preprocessor preprocessor(
config_with_input_dir, test_data.database_config, test_data.reference_genomes
config_with_input_dir,
test_data.database_config,
test_data.reference_genomes,
test_data.alias_lookup
);
DataContainer::database = std::make_unique<Database>(preprocessor.preprocess());

14 changes: 12 additions & 2 deletions src/silo/database.test.cpp
Original file line number Diff line number Diff line change
@@ -26,7 +26,12 @@ silo::Database buildTestDatabase() {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());

silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
return preprocessor.preprocess();
}

@@ -52,7 +57,12 @@ TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());

silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
auto database = preprocessor.preprocess();

const auto simple_database_info = database.getDatabaseInfo();
22 changes: 6 additions & 16 deletions src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
@@ -28,12 +28,14 @@ constexpr std::string_view FASTA_EXTENSION = ".fasta";
Preprocessor::Preprocessor(
config::PreprocessingConfig preprocessing_config_,
config::DatabaseConfig database_config_,
const ReferenceGenomes& reference_genomes
const ReferenceGenomes& reference_genomes,
PangoLineageAliasLookup alias_lookup
)
: preprocessing_config(std::move(preprocessing_config_)),
database_config(std::move(database_config_)),
preprocessing_db(preprocessing_config.getPreprocessingDatabaseLocation(), reference_genomes),
reference_genomes_(reference_genomes) {}
reference_genomes_(reference_genomes),
alias_lookup_(std::move(alias_lookup)) {}

Database Preprocessor::preprocess() {
SPDLOG_INFO(
@@ -50,14 +52,6 @@ Database Preprocessor::preprocess() {
throw silo::preprocessing::PreprocessingException(error);
}

SPDLOG_INFO("preprocessing - building alias key");
const auto pango_lineage_definition_filename =
preprocessing_config.getPangoLineageDefinitionFilename();
PangoLineageAliasLookup alias_key;
if (pango_lineage_definition_filename.has_value()) {
alias_key = PangoLineageAliasLookup::readFromFile(pango_lineage_definition_filename.value());
}

const auto& ndjson_input_filename = preprocessing_config.getNdjsonInputFilename();
if (ndjson_input_filename.has_value()) {
SPDLOG_INFO("preprocessing - ndjson pipeline chosen");
@@ -92,10 +86,7 @@ Database Preprocessor::preprocess() {
SPDLOG_INFO("preprocessing - building database");
preprocessing_db.refreshConnection();
return buildDatabase(
partition_descriptor,
order_by_clause,
alias_key,
preprocessing_config.getIntermediateResultsDirectory()
partition_descriptor, order_by_clause, preprocessing_config.getIntermediateResultsDirectory()
);
}

@@ -462,12 +453,11 @@ void Preprocessor::createPartitionedTableForSequence(
Database Preprocessor::buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const silo::PangoLineageAliasLookup& alias_key,
const std::filesystem::path& intermediate_results_directory
) {
Database database;
database.database_config = database_config;
database.alias_key = alias_key;
database.alias_key = alias_lookup_;
database.intermediate_results_directory = intermediate_results_directory;
const DataVersion& data_version = DataVersion::mineDataVersion();
SPDLOG_INFO("preprocessing - mining data data_version: {}", data_version.toString());
6 changes: 5 additions & 1 deletion src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
@@ -118,7 +118,11 @@ TEST_P(PreprocessorTestFixture, shouldProcessDataSetWithMissingSequences) {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(config.getReferenceGenomeFilename());

silo::preprocessing::Preprocessor preprocessor(config, database_config, reference_genomes);
const auto alias_lookup =
silo::PangoLineageAliasLookup::readFromFile(config.getPangoLineageDefinitionFilename());
silo::preprocessing::Preprocessor preprocessor(
config, database_config, reference_genomes, alias_lookup
);
auto database = preprocessor.preprocess();

const auto database_info = database.getDatabaseInfo();
Original file line number Diff line number Diff line change
@@ -73,8 +73,8 @@ void from_json(const nlohmann::json& json, std::unique_ptr<PangoLineageFilter>&
json.contains("value"), "The field 'value' is required in a PangoLineage expression"
)
CHECK_SILO_QUERY(
json["value"].is_string(),
"The field 'value' in a PangoLineage expression needs to be a string"
json["value"].is_string() || json["value"].is_null(),
"The field 'value' in a PangoLineage expression needs to be a string or null"
)
CHECK_SILO_QUERY(
json.contains("includeSublineages"),
@@ -85,7 +85,7 @@ void from_json(const nlohmann::json& json, std::unique_ptr<PangoLineageFilter>&
"The field 'includeSublineages' in a PangoLineage expression needs to be a boolean"
)
const std::string& column = json["column"];
const std::string& lineage = json["value"];
const std::string& lineage = json["value"].is_null() ? "" : json["value"].get<std::string>();
const bool include_sublineages = json["includeSublineages"];
filter = std::make_unique<PangoLineageFilter>(column, lineage, include_sublineages);
}
21 changes: 14 additions & 7 deletions src/silo/storage/pango_lineage_alias.cpp
Original file line number Diff line number Diff line change
@@ -100,25 +100,32 @@ std::unordered_map<std::string, std::vector<std::string>> readFromJson(
} // namespace

silo::PangoLineageAliasLookup silo::PangoLineageAliasLookup::readFromFile(
const std::filesystem::path& pango_lineage_alias_file
const std::optional<std::filesystem::path>& pango_lineage_alias_file
) {
if (!std::filesystem::exists(pango_lineage_alias_file)) {
if (!pango_lineage_alias_file.has_value()) {
SPDLOG_INFO("No pango lineage alias file provided. Using empty alias lookup.");
return {};
}

const auto& pango_lineage_alias_path = pango_lineage_alias_file.value();

if (!std::filesystem::exists(pango_lineage_alias_path)) {
throw std::filesystem::filesystem_error(
"Alias key file " + pango_lineage_alias_file.string() + " does not exist",
"Alias key file " + pango_lineage_alias_path.string() + " does not exist",
std::error_code()
);
}

if (pango_lineage_alias_file.extension() != ".json") {
if (pango_lineage_alias_path.extension() != ".json") {
throw std::filesystem::filesystem_error(
"Alias key file " + pango_lineage_alias_file.string() + " is not a json file",
"Alias key file " + pango_lineage_alias_path.string() + " is not a json file",
std::error_code()
);
}

SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_file.string());
SPDLOG_INFO("Read pango lineage alias from file: {}", pango_lineage_alias_path.string());

return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_file));
return PangoLineageAliasLookup(readFromJson(pango_lineage_alias_path));
}

} // namespace silo
116 changes: 116 additions & 0 deletions src/silo/test/pango_lineage_filter.test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include <nlohmann/json.hpp>

#include <optional>

#include "silo/test/query_fixture.test.h"

using silo::PangoLineageAliasLookup;
using silo::ReferenceGenomes;
using silo::config::DatabaseConfig;
using silo::config::ValueType;
using silo::test::QueryTestData;
using silo::test::QueryTestScenario;

static const std::string SOME_BASE_PANGO_LINEAGE = "BASE.1";
static const std::string SOME_SUBLINEAGE = "CHILD.1";

nlohmann::json createDataWithPangoLineageValue(const std::string& primaryKey, std::string value) {
return {
{"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", value}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}
};
}

nlohmann::json createDataWithPangoLineageNullValue(const std::string& primaryKey) {
return {
{"metadata", {{"primaryKey", primaryKey}, {"pango_lineage", nullptr}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}
};
}
const std::vector<nlohmann::json> DATA = {
createDataWithPangoLineageValue("id_0", SOME_BASE_PANGO_LINEAGE),
createDataWithPangoLineageValue("id_1", SOME_BASE_PANGO_LINEAGE),
createDataWithPangoLineageValue("id_2", SOME_SUBLINEAGE),
createDataWithPangoLineageNullValue("id_3")
};

const auto DATABASE_CONFIG = DatabaseConfig{
.default_nucleotide_sequence = "segment1",
.schema =
{.instance_name = "dummy name",
.metadata =
{{.name = "primaryKey", .type = ValueType::STRING},
{.name = "pango_lineage", .type = ValueType::PANGOLINEAGE, .generate_index = true}},
.primary_key = "primaryKey"}
};

const auto REFERENCE_GENOMES = ReferenceGenomes{
{{"segment1", "A"}},
{{"gene1", "*"}},
};

const auto ALIAS_LOOKUP = PangoLineageAliasLookup{{{"CHILD", {"BASE.1.1.1"}}}};

const QueryTestData TEST_DATA{
.ndjson_input_data = {DATA},
.database_config = DATABASE_CONFIG,
.reference_genomes = REFERENCE_GENOMES,
.alias_lookup = ALIAS_LOOKUP
};

nlohmann::json createPangoLineageQuery(const nlohmann::json value, bool include_sublineages) {
return {
{"action", {{"type", "Details"}}},
{"filterExpression",
{{"type", "PangoLineage"},
{"column", "pango_lineage"},
{"value", value},
{"includeSublineages", include_sublineages}}}
};
}

const QueryTestScenario PANGO_LINEAGE_FILTER_SCENARIO = {
.name = "pangoLineageFilter",
.query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, false),
.expected_query_result = nlohmann::json(
{{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}}}
)
};

const QueryTestScenario PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO = {
.name = "pangoLineageFilterIncludingSublineages",
.query = createPangoLineageQuery(SOME_BASE_PANGO_LINEAGE, true),
.expected_query_result = nlohmann::json(
{{{"primaryKey", "id_0"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_1"}, {"pango_lineage", SOME_BASE_PANGO_LINEAGE}},
{{"primaryKey", "id_2"}, {"pango_lineage", SOME_SUBLINEAGE}}}
)
};

const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_SCENARIO = {
.name = "pangoLineageFilterNull",
.query = createPangoLineageQuery(nullptr, false),
.expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}})
};

const QueryTestScenario PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO = {
.name = "pangoLineageFilterNullIncludingSublineages",
.query = createPangoLineageQuery(nullptr, true),
.expected_query_result = nlohmann::json({{{"primaryKey", "id_3"}, {"pango_lineage", nullptr}}})
};

QUERY_TEST(
PangoLineageFilterTest,
TEST_DATA,
::testing::Values(
PANGO_LINEAGE_FILTER_SCENARIO,
PANGO_LINEAGE_FILTER_INCLUDING_SUBLINEAGES_SCENARIO,
PANGO_LINEAGE_FILTER_NULL_SCENARIO,
PANGO_LINEAGE_FILTER_NULL_INCLUDING_SUBLINEAGES_SCENARIO
)
);
7 changes: 6 additions & 1 deletion src/silo_api/api.cpp
Original file line number Diff line number Diff line change
@@ -254,8 +254,13 @@ class SiloServer : public Poco::Util::ServerApplication {
const auto reference_genomes =
silo::ReferenceGenomes::readFromFile(preprocessing_config.getReferenceGenomeFilename());

SPDLOG_INFO("preprocessing - reading pango lineage alias");
const auto alias_lookup = silo::PangoLineageAliasLookup::readFromFile(
preprocessing_config.getPangoLineageDefinitionFilename()
);

auto preprocessor = silo::preprocessing::Preprocessor(
preprocessing_config, database_config, reference_genomes
preprocessing_config, database_config, reference_genomes, alias_lookup
);

return preprocessor.preprocess();