Skip to content

Commit

Permalink
fix: start with empty files without throwing an error
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed May 24, 2024
1 parent 8112812 commit ad268d5
Show file tree
Hide file tree
Showing 21 changed files with 452 additions and 403 deletions.
13 changes: 13 additions & 0 deletions include/silo/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,17 @@ std::string removeSymbol(const std::string& value, char symbol);

std::vector<std::string> slice(const std::vector<std::string>& elements, size_t start, size_t end);

/// Returns a new vector in which every string of `elements` has `prefix`
/// put in front of it. The order of `elements` is preserved.
std::vector<std::string> prepend(
const std::string& prefix,
const std::vector<std::string>& elements
);

/// Combines two lists element-wise into one list of strings: entry i of the
/// result is `prefix + elements1[i] + delimiter + elements2[i] + suffix`.
/// `elements1` and `elements2` must have the same size.
std::vector<std::string> tie(
const std::string& prefix,
const std::vector<std::string>& elements1,
const std::string& delimiter,
const std::vector<std::string>& elements2,
const std::string& suffix
);

} // namespace silo
11 changes: 9 additions & 2 deletions include/silo/config/database_config.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <filesystem>
#include <functional>
#include <optional>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -31,8 +32,6 @@ class DatabaseSchema {
std::string primary_key;
std::optional<std::string> date_to_sort_by;
std::optional<std::string> partition_by;

[[nodiscard]] std::string getStrictOrderByClause() const;
};

class DatabaseConfig {
Expand All @@ -52,6 +51,14 @@ class DatabaseConfigReader {

} // namespace silo::config

/// Explicit specialization of std::less for silo::config::DatabaseMetadata.
/// Only the call operator is declared here; its definition lives in
/// src/silo/config/database_config.cpp and compares the `name` members.
template <>
struct std::less<silo::config::DatabaseMetadata> {
// Returns true when `lhs` is ordered before `rhs`.
bool operator()(
const silo::config::DatabaseMetadata& lhs,
const silo::config::DatabaseMetadata& rhs
) const;
};

template <>
struct [[maybe_unused]] fmt::formatter<silo::config::DatabaseConfig> : fmt::formatter<std::string> {
[[maybe_unused]] static auto format(
Expand Down
25 changes: 14 additions & 11 deletions include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,37 @@

#include <filesystem>
#include <string>
#include <unordered_map>
#include <vector>

#include "silo/config/database_config.h"

namespace silo::preprocessing {

class PreprocessingDatabase;

class MetadataInfo {
std::unordered_map<std::string, std::string> metadata_selects;

MetadataInfo(std::unordered_map<std::string, std::string> metadata_selects);

public:
static MetadataInfo validateFromMetadataFile(
static void validateMetadataFile(
const std::filesystem::path& metadata_file,
const silo::config::DatabaseConfig& database_config
);

static MetadataInfo validateFromNdjsonFile(
static void validateNdjsonFile(
const std::filesystem::path& ndjson_file,
const silo::config::DatabaseConfig& database_config
);

std::vector<std::string> getMetadataFields() const;
static std::vector<std::string> getMetadataFields(
const silo::config::DatabaseConfig& database_config
);

std::vector<std::string> getMetadataSelects() const;
static std::vector<std::string> getMetadataTypes(
const silo::config::DatabaseConfig& database_config
);

static std::string getMetadataStruct(const silo::config::DatabaseConfig& database_config);

static std::vector<std::string> getMetadataSelects(
const silo::config::DatabaseConfig& database_config
);
};

} // namespace silo::preprocessing
19 changes: 6 additions & 13 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ namespace preprocessing {
class SequenceInfo;

class Preprocessor {
std::vector<std::string> nuc_sequences;
std::vector<std::string> aa_sequences;
std::vector<std::string> order_by_fields;
config::PreprocessingConfig preprocessing_config;
config::DatabaseConfig database_config;
PreprocessingDatabase preprocessing_db;
Expand All @@ -39,24 +42,15 @@ class Preprocessor {
void buildEmptyPartitioning();

void createInsertionsTableFromFile(
const std::map<std::string, std::string>& expected_sequences,
const std::vector<std::string>& expected_sequences,
const std::filesystem::path& insertion_file,
const std::string& table_name
);

void createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name);

void createAlignedPartitionedSequenceViews(
const std::filesystem::path& file_name,
const SequenceInfo& sequence_info,
const std::string& partition_by_select,
const std::string& partition_by_where
);
void createUnalignedPartitionedSequenceFiles(
const std::filesystem::path& file_name,
const std::string& partition_by_select,
const std::string& partition_by_where
);
void createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name);
void createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name);
void createUnalignedPartitionedSequenceFile(
const std::string& seq_name,
const std::string& table_sql
Expand All @@ -72,7 +66,6 @@ class Preprocessor {

Database buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const std::filesystem::path& intermediate_results_directory
);

Expand Down
22 changes: 11 additions & 11 deletions include/silo/preprocessing/sequence_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,32 @@ namespace preprocessing {
class PreprocessingDatabase;

class SequenceInfo {
std::vector<std::string> nuc_sequence_names;
std::vector<std::string> aa_sequence_names;

public:
explicit SequenceInfo(const silo::ReferenceGenomes& reference_genomes);

[[nodiscard]] std::vector<std::string> getAlignedSequenceSelects(
[[nodiscard]] static std::vector<std::string> getAlignedSequenceSelects(
const silo::ReferenceGenomes& reference_genomes,
const PreprocessingDatabase& preprocessing_db
) const;
);

static std::string getNucleotideSequenceSelect(
[[nodiscard]] static std::string getNucleotideSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

static std::string getUnalignedSequenceSelect(
[[nodiscard]] static std::string getUnalignedSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

static std::string getAminoAcidSequenceSelect(
[[nodiscard]] static std::string getAminoAcidSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

void validate(duckdb::Connection& connection, const std::filesystem::path& input_filename) const;
static void validateNdjsonFile(
const silo::ReferenceGenomes& reference_genomes,
duckdb::Connection& connection,
const std::filesystem::path& input_filename
);
};
} // namespace preprocessing
} // namespace silo
34 changes: 34 additions & 0 deletions src/silo/common/string_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "silo/common/string_utils.h"

#include <algorithm>
#include <cassert>
#include <stdexcept>

#include <fmt/format.h>

namespace silo {

std::vector<std::string> splitBy(const std::string& value, const std::string_view delimiter) {
Expand Down Expand Up @@ -40,4 +43,35 @@ std::vector<std::string> slice(const std::vector<std::string>& elements, size_t
}
return sliced_elements;
}

/// Builds a new vector where each entry is `prefix` followed by the
/// corresponding entry of `elements`; the input order is kept.
std::vector<std::string> prepend(
   const std::string& prefix,
   const std::vector<std::string>& elements
) {
   // Size the result up front so std::transform can write straight into it.
   std::vector<std::string> prefixed(elements.size());
   std::transform(
      elements.begin(),
      elements.end(),
      prefixed.begin(),
      [&prefix](const std::string& element) { return prefix + element; }
   );
   return prefixed;
}

/// Combines two equally long lists element-wise into one list of strings.
///
/// Entry i of the result is
/// `prefix + elements1[i] + delimiter + elements2[i] + suffix`.
///
/// @param prefix     text placed at the start of every result entry
/// @param elements1  left-hand parts
/// @param delimiter  text placed between the left- and right-hand part
/// @param elements2  right-hand parts; must have the same size as `elements1`
/// @param suffix     text placed at the end of every result entry
/// @return one combined string per input pair, in input order
/// @throws std::invalid_argument if `elements1` and `elements2` differ in size
///
/// Note: unqualified calls with standard-library arguments also consider
/// std::tie through argument-dependent lookup; call this as `silo::tie(...)`
/// when in doubt.
std::vector<std::string> tie(
   const std::string& prefix,
   const std::vector<std::string>& elements1,
   const std::string& delimiter,
   const std::vector<std::string>& elements2,
   const std::string& suffix
) {
   // A plain assert disappears when NDEBUG is defined, which would turn a size
   // mismatch into an out-of-bounds read of elements2 below. Check it always.
   if (elements1.size() != elements2.size()) {
      throw std::invalid_argument(
         "tie: elements1 and elements2 must have the same size (" +
         std::to_string(elements1.size()) + " vs " + std::to_string(elements2.size()) + ")"
      );
   }

   std::vector<std::string> output;
   output.reserve(elements1.size());
   for (size_t i = 0; i < elements1.size(); ++i) {
      output.emplace_back();
      std::string& joined = output.back();
      // The final length is known, so allocate once per entry.
      joined.reserve(
         prefix.size() + elements1[i].size() + delimiter.size() + elements2[i].size() +
         suffix.size()
      );
      joined += prefix;
      joined += elements1[i];
      joined += delimiter;
      joined += elements2[i];
      joined += suffix;
   }
   return output;
}

} // namespace silo
17 changes: 7 additions & 10 deletions src/silo/config/database_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ std::string toString(ValueType type) {
}
} // namespace

// Orders two DatabaseMetadata entries by comparing their `name` members with
// operator<; no other member takes part in the comparison.
bool std::less<silo::config::DatabaseMetadata>::operator()(
   const silo::config::DatabaseMetadata& lhs,
   const silo::config::DatabaseMetadata& rhs
) const {
   const auto& left_name = lhs.name;
   const auto& right_name = rhs.name;
   return left_name < right_name;
}

namespace YAML {
template <>
struct convert<silo::config::DatabaseConfig> {
Expand Down Expand Up @@ -179,16 +186,6 @@ ColumnType DatabaseMetadata::getColumnType() const {
throw std::runtime_error("Did not find metadata with name: " + std::string(name));
}

// Produces the ORDER BY clause text for this schema: ordered by
// `date_to_sort_by` first when it is set, and always by `primary_key`.
// Logs which of the two variants was produced.
std::string DatabaseSchema::getStrictOrderByClause() const {
   if (!date_to_sort_by.has_value()) {
      SPDLOG_INFO("preprocessing - produce order by clause without a date to sort by");
      return fmt::format("ORDER BY {}", primary_key);
   }

   SPDLOG_INFO("preprocessing - produce order by clause with a date to sort by");
   const std::string& date_column = *date_to_sort_by;
   return fmt::format("ORDER BY {}, {}", date_column, primary_key);
}

std::optional<DatabaseMetadata> DatabaseConfig::getMetadata(const std::string& name) const {
auto element = std::find_if(
std::begin(schema.metadata),
Expand Down
Loading

0 comments on commit ad268d5

Please sign in to comment.