Skip to content

Commit

Permalink
Code edits
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Dec 15, 2023
1 parent 2fea9b2 commit 7f57c4c
Show file tree
Hide file tree
Showing 11 changed files with 196 additions and 160 deletions.
2 changes: 1 addition & 1 deletion include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace preprocessing {
class PreprocessingDatabase;

class MetadataInfo {
a std::unordered_map<std::string, std::string> metadata_selects;
std::unordered_map<std::string, std::string> metadata_selects;

MetadataInfo(std::unordered_map<std::string, std::string> metadata_selects);

Expand Down
2 changes: 1 addition & 1 deletion include/silo/storage/database_partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class DatabasePartition {

explicit DatabasePartition(std::vector<silo::preprocessing::PartitionChunk> chunks);

void validate();
void validate() const;

void flipBitmaps();

Expand Down
14 changes: 7 additions & 7 deletions src/silo/common/date.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ constexpr uint32_t BYTES_FOR_DAYS = 12;

silo::common::Date silo::common::stringToDate(const std::string& value) {
if (value.empty()) {
return 0;
return NULL_DATE;
}
auto split_position = value.find('-', 0);
if (split_position == std::string::npos) {
SPDLOG_WARN("Expect dates to be delimited by '-': " + value + "\nIgnoring date");
return 0;
return NULL_DATE;
}
auto split_position2 = value.find('-', split_position + 1);
if (split_position2 == std::string::npos) {
SPDLOG_WARN("Expect dates to be delimited twice by '-': " + value + "\nIgnoring date");
return 0;
return NULL_DATE;
}
const std::string year_string = value.substr(0, split_position);
const std::string month_string = value.substr(split_position + 1, split_position2);
Expand All @@ -40,11 +40,11 @@ silo::common::Date silo::common::stringToDate(const std::string& value) {
const uint32_t day = stoi(day_string);
if (month > NUMBER_OF_MONTHS || month == 0) {
SPDLOG_WARN("Month is not in [1,{}]: {} \nIgnoring date", NUMBER_OF_MONTHS, value);
return 0;
return NULL_DATE;
}
if (day > NUMBER_OF_DAYS || day == 0) {
SPDLOG_WARN("Month is not in [1,{}]: {} \nIgnoring date", NUMBER_OF_DAYS, value);
return 0;
return NULL_DATE;
}
// Date is stored with the year in the upper 16 bits, month in bits [12,16), and day [0,12)
const uint32_t date_value =
Expand All @@ -54,12 +54,12 @@ silo::common::Date silo::common::stringToDate(const std::string& value) {
SPDLOG_WARN(
"Parsing of date failed: " + value + "\nWith exception: " + ex.what() + "\nIgnoring date"
);
return 0;
return NULL_DATE;
} catch (const std::out_of_range& ex) {
SPDLOG_WARN(
"Parsing of date failed: " + value + "\nWith exception: " + ex.what() + "\nIgnoring date"
);
return 0;
return NULL_DATE;
}
}

Expand Down
109 changes: 1 addition & 108 deletions src/silo/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,115 +109,8 @@ const std::map<std::string, SequenceStore<AminoAcid>>& Database::getSequenceStor
}

// Checks, for every partition, that each nucleotide / amino-acid sequence store and each
// metadata column holds exactly `partition.sequence_count` entries, that every store has a
// non-empty reference sequence with one position entry per reference symbol, and finally
// calls the partition's own validate().
//
// Throws preprocessing::PreprocessingException describing the first inconsistency found.
//
// NOTE(review): the inline checks below may overlap with what partition.validate() does;
// confirm against DatabasePartition::validate() before keeping both.
void Database::validate() const {
   for (const auto& partition : partitions) {
      const size_t partition_size = partition.sequence_count;

      for (const auto& [name, nuc_store] : partition.nuc_sequences) {
         if (nuc_store.sequence_count != partition_size) {
            throw preprocessing::PreprocessingException(fmt::format(
               "nuc_store {} ({}) has invalid size (expected {}).",
               name,
               nuc_store.sequence_count,
               partition_size
            ));
         }
         if (nuc_store.positions.size() != nuc_store.reference_sequence.size()) {
            throw preprocessing::PreprocessingException(fmt::format(
               "nuc_store positions {} ({}) has size unequal to reference (expected {}).",
               name,
               nuc_store.positions.size(),
               nuc_store.reference_sequence.size()
            ));
         }
         if (nuc_store.reference_sequence.size() == 0) {
            throw preprocessing::PreprocessingException(
               "reference_sequence " + name + " is empty."
            );
         }
         if (nuc_store.missing_symbol_bitmaps.size() != partition_size) {
            throw preprocessing::PreprocessingException(
               "nuc_store.missing_symbol_bitmaps " + name + " has invalid size."
            );
         }
      }

      for (const auto& [name, aa_store] : partition.aa_sequences) {
         if (aa_store.sequence_count != partition_size) {
            throw preprocessing::PreprocessingException(fmt::format(
               "aa_store {} ({}) has invalid size (expected {}).",
               name,
               aa_store.sequence_count,
               partition_size
            ));
         }
         if (aa_store.positions.size() != aa_store.reference_sequence.size()) {
            throw preprocessing::PreprocessingException(
               "aa_store " + name + " has invalid position size."
            );
         }
         if (aa_store.reference_sequence.size() == 0) {
            throw preprocessing::PreprocessingException(
               "reference_sequence " + name + " is empty."
            );
         }
         if (aa_store.missing_symbol_bitmaps.size() != partition_size) {
            throw preprocessing::PreprocessingException(
               "aa_store.missing_symbol_bitmaps " + name + " has invalid size."
            );
         }
      }

      // Every metadata column group gets the same size check; only the label in the
      // error message differs, so the check is written once and applied per group.
      const auto require_column_sizes = [partition_size](
                                           const char* group_name, const auto& columns
                                        ) {
         for (const auto& [column_name, column] : columns) {
            if (column.getValues().size() != partition_size) {
               throw preprocessing::PreprocessingException(
                  fmt::format("{} {} has invalid size.", group_name, column_name)
               );
            }
         }
      };

      // The group order is kept as before so the first reported error stays the same.
      require_column_sizes("aa_insertion_columns", partition.columns.aa_insertion_columns);
      require_column_sizes("nuc_insertion_columns", partition.columns.nuc_insertion_columns);
      require_column_sizes("date_columns", partition.columns.date_columns);
      require_column_sizes("int_columns", partition.columns.int_columns);
      require_column_sizes("indexed_string_columns", partition.columns.indexed_string_columns);
      require_column_sizes("string_columns", partition.columns.string_columns);
      require_column_sizes("float_columns", partition.columns.float_columns);

      partition.validate();
   }
}

Expand Down
7 changes: 7 additions & 0 deletions src/silo/preprocessing/metadata_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ std::unordered_map<std::string, std::string> validateFieldsAgainstConfig(
if (std::find(config_metadata_fields.begin(), config_metadata_fields.end(), field_name)
!= config_metadata_fields.end()) {
validated_metadata_fields.emplace(field_name, access_path);
} else {
SPDLOG_WARN(
"Metadata field {} ({}), which is contained in the file is not contained in the "
"config.",
field_name,
access_path
);
}
}
for (const std::string& name : config_metadata_fields) {
Expand Down
65 changes: 60 additions & 5 deletions src/silo/preprocessing/metadata_info.test.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
#include "silo/preprocessing/metadata_info.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "silo/config/config_repository.h"
#include "silo/preprocessing/metadata.h"
#include "silo/preprocessing/preprocessing_exception.h"

TEST(
Expand Down Expand Up @@ -46,7 +44,64 @@ TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) {
"gisaid_epi_isl",
}};

EXPECT_NO_THROW(silo::preprocessing::MetadataInfo::validateFromMetadataFile(
"testBaseData/exampleDataset/small_metadata_set.tsv", valid_config
));
silo::preprocessing::MetadataInfo fields =
silo::preprocessing::MetadataInfo::validateFromMetadataFile(
"testBaseData/exampleDataset/small_metadata_set.tsv", valid_config
);
ASSERT_TRUE(
std::find(
fields.getMetadataFields().begin(), fields.getMetadataFields().end(), "gisaid_epi_isl"
) != fields.getMetadataFields().end()
);
ASSERT_TRUE(
std::find(
fields.getMetadataFields().begin(), fields.getMetadataFields().end(), "pango_lineage"
) != fields.getMetadataFields().end()
);
ASSERT_TRUE(
std::find(fields.getMetadataFields().begin(), fields.getMetadataFields().end(), "date") !=
fields.getMetadataFields().end()
);
ASSERT_TRUE(
std::find(fields.getMetadataFields().begin(), fields.getMetadataFields().end(), "country") !=
fields.getMetadataFields().end()
);
}

// Validates the example NDJSON input against a config that declares four metadata fields
// and asserts that each of those fields is reported by getMetadataFields().
TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) {
   const silo::config::DatabaseConfig valid_config{
      "main",
      {
         "testInstanceName",
         {
            {"gisaid_epi_isl", silo::config::ValueType::STRING},
            {"pango_lineage", silo::config::ValueType::PANGOLINEAGE},
            {"date", silo::config::ValueType::DATE},
            {"country", silo::config::ValueType::STRING},
         },
         "gisaid_epi_isl",
      }};

   silo::preprocessing::MetadataInfo fields =
      silo::preprocessing::MetadataInfo::validateFromNdjsonFile(
         "testBaseData/exampleDatasetAsNdjson/input_file.ndjson", valid_config
      );

   // Fetch the field list once. If getMetadataFields() returns by value, taking begin()
   // and end() from two separate calls would pair iterators of different temporaries,
   // which is undefined behavior; a single local is correct for either return type.
   const auto metadata_fields = fields.getMetadataFields();
   const auto contains_field = [&metadata_fields](const char* field_name) {
      return std::find(metadata_fields.begin(), metadata_fields.end(), field_name) !=
             metadata_fields.end();
   };

   ASSERT_TRUE(contains_field("gisaid_epi_isl"));
   ASSERT_TRUE(contains_field("pango_lineage"));
   ASSERT_TRUE(contains_field("date"));
   ASSERT_TRUE(contains_field("country"));
}
8 changes: 0 additions & 8 deletions src/silo/preprocessing/preprocessing_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,6 @@

#include "silo/preprocessing/partition.h"

namespace {

// Builds the label "P<partition>_C<chunk>" for a partition chunk,
// e.g. (2, 1) yields "P2_C1".
std::string buildChunkString(uint32_t partition, uint32_t chunk) {
   std::string label{"P"};
   label += std::to_string(partition);
   label += "_C";
   label += std::to_string(chunk);
   return label;
}

} // namespace

namespace silo::preprocessing {

std::filesystem::path createOutputPath(
Expand Down
24 changes: 0 additions & 24 deletions src/silo/preprocessing/preprocessing_config_reader.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,6 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithCorrectParametersAndDefaults
ASSERT_EQ(
config.getPangoLineageDefinitionFilename(), input_directory + "pangolineage_alias.json"
);
ASSERT_EQ(
config.getNucPartitionFilename("dummy", 0, 0),
intermediate_directory + "partitions/nuc_dummy/P0_C0.zstdfasta"
);
ASSERT_EQ(
config.getGenePartitionFilename("dummy2", 0, 0),
intermediate_directory + "partitions/gene_dummy2/P0_C0.zstdfasta"
);
ASSERT_EQ(
config.getNucSortedPartitionFilename("dummy", 2, 1),
intermediate_directory + "partitions_sorted/nuc_dummy/P2_C1.zstdfasta"
);
ASSERT_EQ(
config.getGeneSortedPartitionFilename("dummy", 2, 1),
intermediate_directory + "partitions_sorted/gene_dummy/P2_C1.zstdfasta"
);
}

TEST(PreprocessingConfigReader, shouldThrowExceptionWhenConfigFileDoesNotExist) {
Expand Down Expand Up @@ -67,14 +51,6 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithOverriddenDefaults) {
);

ASSERT_EQ(config.getNucFilenameNoExtension("aligned"), input_directory + "aligned");
ASSERT_EQ(
config.getNucPartitionFilename("aligned", 0, 1),
intermediate_directory + "folder1/aligned/P0_C1.zstdfasta"
);
ASSERT_EQ(
config.getNucSortedPartitionFilename("aligned", 2, 3),
intermediate_directory + "folder2/aligned/P2_C3.zstdfasta"
);
ASSERT_EQ(config.getOutputDirectory(), "./output/custom/");
}

Expand Down
2 changes: 1 addition & 1 deletion src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ void Preprocessor::buildTablesFromNdjsonInput(

const auto metadata_info = MetadataInfo::validateFromNdjsonFile(file_name, database_config);

preprocessing_db.registerSequences(reference_genomes);
PreprocessingDatabase::registerSequences(reference_genomes);

(void)preprocessing_db.query(fmt::format(
"CREATE OR REPLACE TABLE preprocessing_table AS SELECT {}, "
Expand Down
10 changes: 5 additions & 5 deletions src/silo/storage/column_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,35 +104,35 @@ void ColumnPartitionGroup::addValueToColumn(
break;
case silo::config::ColumnType::DATE:
if (value.IsNull()) {
date_columns.at(column_name).insert(common::stringToDate(""));
date_columns.at(column_name).insertNull();
} else {
date_columns.at(column_name).insert(common::stringToDate(value.ToString()));
}
break;
case silo::config::ColumnType::INT:
if (value.IsNull()) {
int_columns.at(column_name).insert("");
int_columns.at(column_name).insertNull();
} else {
int_columns.at(column_name).insert(value.ToString());
}
break;
case silo::config::ColumnType::FLOAT:
if (value.IsNull()) {
float_columns.at(column_name).insert("");
float_columns.at(column_name).insertNull();
} else {
float_columns.at(column_name).insert(value.ToString());
}
break;
case silo::config::ColumnType::NUC_INSERTION:
if (value.IsNull()) {
nuc_insertion_columns.at(column_name).insert("");
nuc_insertion_columns.at(column_name).insertNull();
} else {
nuc_insertion_columns.at(column_name).insert(value.ToString());
}
break;
case silo::config::ColumnType::AA_INSERTION:
if (value.IsNull()) {
aa_insertion_columns.at(column_name).insert("");
aa_insertion_columns.at(column_name).insertNull();
} else {
aa_insertion_columns.at(column_name).insert(value.ToString());
}
Expand Down
Loading

0 comments on commit 7f57c4c

Please sign in to comment.