From 344ec7b20a3d6727e8972334542a610260cdc782 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Fri, 14 Jun 2024 09:13:57 +0200 Subject: [PATCH 1/3] fix: more efficient ndjson emptiness check (#481) --- src/silo/preprocessing/metadata_info.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/silo/preprocessing/metadata_info.cpp b/src/silo/preprocessing/metadata_info.cpp index e2993ac43..cd78235cc 100644 --- a/src/silo/preprocessing/metadata_info.cpp +++ b/src/silo/preprocessing/metadata_info.cpp @@ -91,7 +91,7 @@ bool MetadataInfo::isNdjsonFileEmpty(const std::filesystem::path& ndjson_file) { auto result = connection.Query(fmt::format( "SELECT COUNT(*) " - "FROM read_json_auto(\"{}\");", + "FROM (SELECT * FROM read_json_auto(\"{}\") LIMIT 1);", ndjson_file.string() )); From 4d7b25d4f2d43f1e01f58b5031c908020b1625ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:07:52 +0200 Subject: [PATCH 2/3] chore(main): release 0.2.4 (#479) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .release-please-manifest.json | 2 +- CHANGELOG.md | 12 ++++++++++++ version.txt | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index c4302872b..cac31681a 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.2.3" + ".": "0.2.4" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e806adaa..9887cb945 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## [0.2.4](https://github.com/GenSpectrum/LAPIS-SILO/compare/v0.2.3...v0.2.4) (2024-06-14) + + +### Features + +* allow null for sequenceName in insertion contains queries ([6dbe251](https://github.com/GenSpectrum/LAPIS-SILO/commit/6dbe251a03e8a317188de0292f0e637b8ec4c24d)) + + +### Bug Fixes + +* more efficient ndjson emptiness check ([#481](https://github.com/GenSpectrum/LAPIS-SILO/issues/481)) ([344ec7b](https://github.com/GenSpectrum/LAPIS-SILO/commit/344ec7b20a3d6727e8972334542a610260cdc782)) + ## [0.2.3](https://github.com/GenSpectrum/LAPIS-SILO/compare/v0.2.2...v0.2.3) (2024-06-10) diff --git a/version.txt b/version.txt index 717903969..abd410582 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.2.3 +0.2.4 From 9b65e941b3529e785d4c8a7512d2811821a0dd22 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Mon, 17 Jun 2024 09:33:14 +0200 Subject: [PATCH 3/3] refactor: lambda-callbacks instead of fill columns (#482) instead of the columns pulling the next row from the file_reader, the file_reader instead pushes the value to the columns using callbacks. This is in preparation for parallelizing the building of column functions --- include/silo/common/table_reader.h | 24 ++++++++---- include/silo/storage/column_group.h | 7 ---- src/silo/common/table_reader.cpp | 10 ++++- src/silo/preprocessing/preprocessor.cpp | 32 ++++++++++++---- src/silo/storage/column_group.cpp | 51 ------------------------- 5 files changed, 49 insertions(+), 75 deletions(-) diff --git a/include/silo/common/table_reader.h b/include/silo/common/table_reader.h index 8b6d525a4..12763b78a 100644 --- a/include/silo/common/table_reader.h +++ b/include/silo/common/table_reader.h @@ -13,9 +13,16 @@ namespace silo { -struct ColumnFunction { +class ColumnFunction { + friend class TableReader; std::string column_name; std::function function; + + public: + ColumnFunction( + std::string column_name, + std::function function + ); }; class TableReader { @@ -31,12 +38,6 @@ class TableReader { size_t current_row; size_t current_row_in_chunk; - std::optional nextKey(); - - std::string getTableQuery(); - - void advanceRow(); - public: explicit TableReader( duckdb::Connection& connection, @@ -47,7 +48,14 @@ class TableReader { std::string_view order_by_clause ); - void read(); + size_t read(); + + private: + std::optional nextKey(); + + std::string getTableQuery(); + + void advanceRow(); void loadTable(); }; diff --git a/include/silo/storage/column_group.h b/include/silo/storage/column_group.h index 41e547152..dc3c17640 100644 --- a/include/silo/storage/column_group.h +++ b/include/silo/storage/column_group.h @@ -81,13 +81,6 @@ class ColumnPartitionGroup { std::map date_columns; std::map pango_lineage_columns; - uint32_t fill( - duckdb::Connection& connection, - uint32_t partition_id, - const std::string& order_by_clause, - const silo::config::DatabaseConfig& database_config - ); - void addValueToColumn( const std::string& column_name, config::ColumnType column_type, diff --git a/src/silo/common/table_reader.cpp b/src/silo/common/table_reader.cpp index 9e73c88eb..9c4848947 100644 --- a/src/silo/common/table_reader.cpp +++ b/src/silo/common/table_reader.cpp @@ -11,6 +11,13 @@ #include "silo/preprocessing/preprocessing_exception.h" +silo::ColumnFunction::ColumnFunction( + std::string column_name, + std::function function +) + : column_name(std::move(column_name)), + function(std::move(function)) {} + silo::TableReader::TableReader( duckdb::Connection& connection, std::string_view table_name, @@ -34,7 +41,7 @@ std::optional silo::TableReader::nextKey() { return current_chunk->GetValue(0, current_row_in_chunk).GetValue(); } -void silo::TableReader::read() { +size_t silo::TableReader::read() { loadTable(); assert(query_result->ColumnCount() == column_functions.size() + 1); while (nextKey()) { @@ -44,6 +51,7 @@ void silo::TableReader::read() { } advanceRow(); } + return current_row; } std::string silo::TableReader::getTableQuery() { diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index 953286ce1..99a5ed40f 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -651,15 +651,31 @@ void Preprocessor::buildMetadataStore( ) { for (size_t partition_id = 0; partition_id < partition_descriptor.getPartitions().size(); ++partition_id) { - const auto& part = partition_descriptor.getPartitions()[partition_id]; - for (size_t chunk_index = 0; chunk_index < part.getPartitionChunks().size(); ++chunk_index) { - const uint32_t sequences_added = - database.partitions.at(partition_id) - .columns.fill( - preprocessing_db.getConnection(), partition_id, order_by_clause, database_config - ); - database.partitions.at(partition_id).sequence_count += sequences_added; + auto& column_group = database.partitions.at(partition_id).columns; + std::vector column_functions; + column_functions.reserve(database_config.schema.metadata.size()); + for (auto& item : database_config.schema.metadata) { + column_functions.emplace_back( + item.name, + [&](size_t /*row_idx*/, const duckdb::Value& value) { + if (value.IsNull()) { + column_group.addNullToColumn(item.name, item.getColumnType()); + } else { + column_group.addValueToColumn(item.name, item.getColumnType(), value); + } + } + ); } + TableReader table_reader( + preprocessing_db.getConnection(), + "partitioned_metadata", + database_config.schema.primary_key, + column_functions, + fmt::format("partition_id = {}", partition_id), + order_by_clause + ); + const size_t number_of_rows = table_reader.read(); + database.partitions.at(partition_id).sequence_count += number_of_rows; SPDLOG_INFO("build - finished columns for partition {}", partition_id); } } diff --git a/src/silo/storage/column_group.cpp b/src/silo/storage/column_group.cpp index d377db5a3..9bb08e424 100644 --- a/src/silo/storage/column_group.cpp +++ b/src/silo/storage/column_group.cpp @@ -26,57 +26,6 @@ using silo::config::ColumnType; using silo::common::OptionalBool; -uint32_t ColumnPartitionGroup::fill( - duckdb::Connection& connection, - uint32_t partition_id, - const std::string& order_by_clause, - const silo::config::DatabaseConfig& database_config -) { - uint32_t sequence_count = 0; - - std::vector column_names; - column_names.reserve(database_config.schema.metadata.size()); - for (const auto& item : database_config.schema.metadata) { - column_names.push_back("\"" + item.name + "\""); - } - std::string column_name_sql = boost::algorithm::join(column_names, ", "); - - auto result = connection.Query(fmt::format( - "SELECT {} FROM partitioned_metadata WHERE partition_id = {} {}", - column_name_sql, - partition_id, - order_by_clause - )); - if (result->HasError()) { - throw preprocessing::PreprocessingException( - "Error in the execution of the duckdb statement for partition key table " - "generation: " + - result->GetError() - ); - } - const size_t row_count = result->RowCount(); - for (const auto& item : database_config.schema.metadata) { - const auto column_type = item.getColumnType(); - reserveSpaceInColumn(item.name, column_type, row_count); - } - - for (auto it = result->begin(); it != result->end(); ++it) { - size_t column_index = 0; - for (const auto& item : database_config.schema.metadata) { - const auto column_type = item.getColumnType(); - const auto value = it.current_row.GetValue(column_index++); - addValueToColumn(item.name, column_type, value); - } - if (++sequence_count == UINT32_MAX) { - throw std::runtime_error( - "SILO is currently limited to UINT32_MAX=" + std::to_string(UINT32_MAX) + " sequences." - ); - } - } - - return sequence_count; -} - void ColumnPartitionGroup::addValueToColumn( const std::string& column_name, ColumnType column_type,