Skip to content

Commit

Permalink
feat: insertions no longer in metadata or databaseConfig, instead expected for all aligned sequences #372
Browse files Browse the repository at this point in the history

BREAKING CHANGE: old database_config files might be invalid if they contained insertion columns. Also, we are now stricter about ndjson input files, which MUST contain nucleotide/amino acid insertions for all respective sequences. The insertions action and filter no longer require a column field.
  • Loading branch information
Taepper authored and fengelniederhammer committed May 23, 2024
1 parent decae59 commit 4e29220
Show file tree
Hide file tree
Showing 68 changed files with 1,012 additions and 1,178 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": ""
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": "CC+++"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": "CC..*"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"query": {
"action": {
"type": "Insertions",
"column": "nucleotideInsertions",
"sequenceName": "S"
},
"filterExpression": {
Expand Down
17 changes: 0 additions & 17 deletions endToEndTests/test/invalidQueries/insertionsInvalidColumn.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"query": {
"action": {
"type": "Insertions",
"column": "nucleotideInsertions",
"sequenceName": "notAValidSequence"
},
"filterExpression": {
Expand Down
7 changes: 1 addition & 6 deletions endToEndTests/test/queries/aaInsertionsContains.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"query": {
"action": {
"type": "Details",
"fields": ["aminoAcidInsertions", "gisaid_epi_isl"],
"fields": ["gisaid_epi_isl"],
"orderByFields": ["gisaid_epi_isl"]
},
"filterExpression": {
Expand All @@ -16,23 +16,18 @@
},
"expectedQueryResult": [
{
"aminoAcidInsertions": "S:214:EPE",
"gisaid_epi_isl": "EPI_ISL_1003629"
},
{
"aminoAcidInsertions": "S:214:EPE",
"gisaid_epi_isl": "EPI_ISL_1408408"
},
{
"aminoAcidInsertions": "S:214:EPE",
"gisaid_epi_isl": "EPI_ISL_1597932"
},
{
"aminoAcidInsertions": "S:214:EPE",
"gisaid_epi_isl": "EPI_ISL_1750868"
},
{
"aminoAcidInsertions": "S:214:EPE",
"gisaid_epi_isl": "EPI_ISL_581968"
}
]
Expand Down
1 change: 0 additions & 1 deletion endToEndTests/test/queries/insertionContains_exact.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": "CCC"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 22339,
"value": ".*GCT.*GGT.*"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 22204,
"value": "CAG.*AA"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": "TCAG.*AA"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 25701,
"value": "CC.*"
}
Expand Down
1 change: 0 additions & 1 deletion endToEndTests/test/queries/insertionsAction.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"query": {
"action": {
"type": "Insertions",
"column": "nucleotideInsertions",
"orderByFields": ["insertion"]
},
"filterExpression": {
Expand Down
3 changes: 1 addition & 2 deletions endToEndTests/test/queries/insertionsActionAndFilter.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
"query": {
"action": {
"type": "Insertions",
"column": "nucleotideInsertions"
"sequenceName": "main"
},
"filterExpression": {
"type": "InsertionContains",
"column": "nucleotideInsertions",
"position": 22339,
"value": ".*C.*G.*"
}
Expand Down
2 changes: 1 addition & 1 deletion include/silo/common/data_version.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class DataVersion {
uint32_t value;
};

static constexpr SerializationVersion CURRENT_SILO_SERIALIZATION_VERSION{0};
static constexpr SerializationVersion CURRENT_SILO_SERIALIZATION_VERSION{1};

private:
Timestamp timestamp;
Expand Down
53 changes: 53 additions & 0 deletions include/silo/common/table_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#pragma once

#include <filesystem>
#include <fstream>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include <duckdb.hpp>

namespace silo {

/// Pairs a column name with a callback that accepts a `size_t` and a `duckdb::Value`.
///
/// NOTE(review): presumably the callback is invoked once per row read from that column,
/// with the `size_t` being a row index/id and the value being the cell content —
/// confirm against the TableReader implementation.
struct ColumnFunction {
/// Name of the column this entry refers to.
std::string column_name;
/// Callback receiving a `size_t` and the `duckdb::Value`; it returns nothing.
std::function<void(size_t, const duckdb::Value&)> function;
};

/// Holds everything needed to query one table through a DuckDB connection: the table
/// name, a key column, a list of per-column callbacks and optional WHERE / ORDER BY
/// clause strings, plus the materialized result and the chunk/row currently in use.
///
/// The connection is stored by reference, so it must outlive the reader. Because of the
/// reference and `std::unique_ptr` members, the class is neither copy-constructible nor
/// assignable.
///
/// NOTE(review): the exact division of labour between `loadTable()` and `read()` is
/// defined in the implementation file, which is not visible from this header —
/// presumably `loadTable()` runs the query and `read()` iterates its rows, feeding the
/// registered column callbacks. Confirm before relying on it.
class TableReader {
 private:
  /// Non-owning handle to the DuckDB connection used for querying.
  duckdb::Connection& connection;
  /// Name of the table to be read.
  std::string table_name;
  /// Name of the key column (see `nextKey()`).
  std::string key_column;
  /// Column-name / callback pairs supplied by the caller.
  std::vector<ColumnFunction> column_functions;
  /// Text of the WHERE clause as passed to the constructor.
  std::string where_clause;
  /// Text of the ORDER BY clause as passed to the constructor.
  std::string order_by_clause;
  /// Materialized query result; null until one has been assigned.
  std::unique_ptr<duckdb::MaterializedQueryResult> query_result;
  /// Data chunk currently being processed; null until one has been assigned.
  std::unique_ptr<duckdb::DataChunk> current_chunk;
  /// Row position within the current chunk. Zero-initialised here so the member never
  /// holds an indeterminate value, regardless of what the constructor sets.
  size_t current_row = 0;

  /// Returns the next key as a string, or an empty optional.
  /// NOTE(review): presumably empty means "no more rows" — confirm.
  std::optional<std::string> nextKey();

  /// Returns the query string for this table.
  /// NOTE(review): presumably assembled from the table name, key column, column
  /// functions and the WHERE / ORDER BY clauses — confirm.
  std::string getTableQuery();

  /// Moves the reader on to the following row.
  void advanceRow();

 public:
  /// @param connection        DuckDB connection to use; kept by reference.
  /// @param table_name        Name of the table to read.
  /// @param key_column        Name of the key column.
  /// @param column_functions  Column-name / callback pairs.
  /// @param where_clause      WHERE clause text.
  /// @param order_by_clause   ORDER BY clause text.
  explicit TableReader(
    duckdb::Connection& connection,
    std::string_view table_name,
    std::string_view key_column,
    std::vector<ColumnFunction> column_functions,
    std::string_view where_clause,
    std::string_view order_by_clause
  );

  /// Reads the table. NOTE(review): see the class comment for the assumed semantics.
  void read();

  /// Loads the table. NOTE(review): see the class comment for the assumed semantics.
  void loadTable();
};
} // namespace silo
14 changes: 2 additions & 12 deletions include/silo/config/database_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,8 @@

namespace silo::config {

enum class ValueType { STRING, PANGOLINEAGE, DATE, BOOL, INT, FLOAT, NUC_INSERTION, AA_INSERTION };
enum class ColumnType {
STRING,
INDEXED_STRING,
INDEXED_PANGOLINEAGE,
DATE,
BOOL,
INT,
FLOAT,
NUC_INSERTION,
AA_INSERTION
};
enum class ValueType { STRING, PANGOLINEAGE, DATE, BOOL, INT, FLOAT };
enum class ColumnType { STRING, INDEXED_STRING, INDEXED_PANGOLINEAGE, DATE, BOOL, INT, FLOAT };

ValueType toDatabaseValueType(std::string_view type);

Expand Down
12 changes: 9 additions & 3 deletions include/silo/config/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,15 @@ const std::string REFERENCE_GENOME_FILENAME_OPTION = "referenceGenomeFilename";
const std::string NUCLEOTIDE_SEQUENCE_PREFIX_OPTION = "nucleotideSequencePrefix";
const std::string UNALIGNED_NUCLEOTIDE_SEQUENCE_PREFIX_OPTION = "unalignedNucleotideSequencePrefix";
const std::string GENE_PREFIX_OPTION = "genePrefix";
const std::string NUCLEOTIDE_INSERTIONS_OPTION = "nucleotideInsertionsFilename";
const std::string AMINO_ACID_INSERTIONS_OPTION = "aminoAcidInsertionsFilename";

const std::string DEFAULT_OUTPUT_DIRECTORY = "./output/";

class PreprocessingConfig {
friend class fmt::formatter<silo::config::PreprocessingConfig>;

public:
std::filesystem::path input_directory = "./";
std::filesystem::path output_directory = DEFAULT_OUTPUT_DIRECTORY;
std::filesystem::path intermediate_results_directory = "./temp/";
Expand All @@ -40,9 +43,8 @@ class PreprocessingConfig {
std::string nucleotide_sequence_prefix = "nuc_";
std::string unaligned_nucleotide_sequence_prefix = "unaligned_";
std::string gene_prefix = "gene_";

public:
explicit PreprocessingConfig();
std::string nuc_insertions_filename = "nuc_insertions.tsv";
std::string aa_insertions_filename = "aa_insertions.tsv";

void validate() const;

Expand All @@ -67,6 +69,10 @@ class PreprocessingConfig {

[[nodiscard]] std::filesystem::path getGeneFilenameNoExtension(std::string_view gene_name) const;

[[nodiscard]] std::filesystem::path getNucleotideInsertionsFilename() const;

[[nodiscard]] std::filesystem::path getAminoAcidInsertionsFilename() const;

void overwrite(const silo::config::AbstractConfig& config_reader);
};

Expand Down
1 change: 0 additions & 1 deletion include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ class Database {
void initializeAASequences(
const std::map<std::string, std::vector<AminoAcid::Symbol>>& reference_sequences
);
void finalizeInsertionIndexes();

template <typename SymbolType>
static BitmapSizePerSymbol calculateBitmapSizePerSymbol(
Expand Down
6 changes: 6 additions & 0 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ class Preprocessor {
void buildPartitioningTableByColumn(const std::string& partition_by_field);
void buildEmptyPartitioning();

/// Declared to create an insertions table named `table_name` from `insertion_file`.
///
/// @param expected_sequences  String-to-string map of the sequences expected in the file.
///                            NOTE(review): the meaning of key and value is not visible
///                            from this declaration — confirm against the definition.
/// @param insertion_file      Path of the insertions file to read.
/// @param table_name          Name of the table to create.
void createInsertionsTableFromFile(
const std::map<std::string, std::string>& expected_sequences,
const std::filesystem::path& insertion_file,
const std::string& table_name
);

void createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name);

void createAlignedPartitionedSequenceViews(
Expand Down
21 changes: 3 additions & 18 deletions include/silo/query_engine/actions/insertions.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,10 @@
namespace silo {
class Database;
namespace storage {
namespace column {
template <typename SymbolType>
class InsertionColumnPartition;
namespace insertion {
template <typename SymbolType>
class InsertionIndex;
} // namespace insertion
} // namespace column
} // namespace storage
} // namespace silo

Expand All @@ -41,27 +37,19 @@ class InsertionAggregation : public Action {
static constexpr std::string_view SEQUENCE_FIELD_NAME = "sequenceName";
static constexpr std::string_view COUNT_FIELD_NAME = "count";

std::vector<std::string> column_names;
std::vector<std::string> sequence_names;

struct PrefilteredBitmaps {
std::vector<std::pair<
const OperatorResult&,
const silo::storage::column::insertion::InsertionIndex<SymbolType>&>>
const silo::storage::insertion::InsertionIndex<SymbolType>&>>
bitmaps;
std::vector<std::pair<
const OperatorResult&,
const silo::storage::column::insertion::InsertionIndex<SymbolType>&>>
const silo::storage::insertion::InsertionIndex<SymbolType>&>>
full_bitmaps;
};

void addAllColumnIndexesToPreFilteredBitmaps(
const silo::storage::column::InsertionColumnPartition<SymbolType>& column,
const OperatorResult& filter,
std::unordered_map<std::string, InsertionAggregation<SymbolType>::PrefilteredBitmaps>&
bitmaps_to_evaluate
) const;

void addAggregatedInsertionsToInsertionCounts(
std::vector<QueryResultEntry>& output,
const std::string& sequence_name,
Expand All @@ -76,10 +64,7 @@ class InsertionAggregation : public Action {
) const;

public:
InsertionAggregation(
std::vector<std::string>&& column,
std::vector<std::string>&& sequence_names
);
InsertionAggregation(std::vector<std::string>&& sequence_names);

void validateOrderByFields(const Database& database) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,12 @@ namespace silo::query_engine::filter_expressions {
template <typename SymbolType>
class InsertionContains : public Expression {
private:
std::vector<std::string> column_names;
std::optional<std::string> sequence_name;
uint32_t position_idx;
std::string value;

public:
explicit InsertionContains(
std::vector<std::string>&& column_names,
std::optional<std::string> sequence_name,
uint32_t position_idx,
std::string value
Expand Down
Loading

0 comments on commit 4e29220

Please sign in to comment.