From 7c0fedca84d276172bb4c4f671ddcc2d5c419f5f Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 30 Sep 2024 14:11:55 +0200 Subject: [PATCH 01/20] wip on globbing --- data/csv/glob_dif_dialect/14166/__2000.csv | 3 ++ data/csv/glob_dif_dialect/14166/__2001.csv | 1 + .../buffer_manager/csv_buffer_manager.cpp | 8 ++-- .../csv_scanner/scanner/base_scanner.cpp | 2 +- .../csv_scanner/scanner/csv_schema.cpp | 42 ++++++++++++++--- .../csv_scanner/sniffer/csv_sniffer.cpp | 45 ++++++++++--------- .../csv_scanner/sniffer/dialect_detection.cpp | 2 +- .../csv_scanner/sniffer/header_detection.cpp | 2 +- .../csv_scanner/sniffer/type_detection.cpp | 2 +- .../csv_scanner/sniffer/type_refinement.cpp | 2 +- .../csv_scanner/sniffer/type_replacement.cpp | 2 +- .../state_machine/csv_state_machine.cpp | 2 +- .../state_machine/csv_state_machine_cache.cpp | 2 +- .../table_function/csv_file_scanner.cpp | 2 +- .../table_function/global_csv_state.cpp | 2 +- src/function/table/copy_csv.cpp | 2 +- src/function/table/read_csv.cpp | 2 +- src/function/table/sniff_csv.cpp | 2 +- .../csv_scanner/csv_buffer_manager.hpp | 8 ++-- .../operator/csv_scanner/csv_schema.hpp | 5 ++- .../csv_scanner/{ => sniffer}/csv_sniffer.hpp | 17 ++----- .../csv_scanner/sniffer/sniff_result.hpp | 34 ++++++++++++++ src/main/relation/read_csv_relation.cpp | 2 +- test/sql/copy/csv/glob/read_csv_glob.test | 11 +++++ 24 files changed, 138 insertions(+), 64 deletions(-) create mode 100644 data/csv/glob_dif_dialect/14166/__2000.csv create mode 100644 data/csv/glob_dif_dialect/14166/__2001.csv rename src/include/duckdb/execution/operator/csv_scanner/{ => sniffer}/csv_sniffer.hpp (95%) create mode 100644 src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp diff --git a/data/csv/glob_dif_dialect/14166/__2000.csv b/data/csv/glob_dif_dialect/14166/__2000.csv new file mode 100644 index 00000000000..a6885b98e8f --- /dev/null +++ b/data/csv/glob_dif_dialect/14166/__2000.csv @@ -0,0 +1,3 @@ 
+date_col,int_col,double_col +2000-01-01,10,80.9189441112103 +2000-01-02,5,109.16581782022259 diff --git a/data/csv/glob_dif_dialect/14166/__2001.csv b/data/csv/glob_dif_dialect/14166/__2001.csv new file mode 100644 index 00000000000..bb6ef187f95 --- /dev/null +++ b/data/csv/glob_dif_dialect/14166/__2001.csv @@ -0,0 +1 @@ +date_col,int_col,double_col diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp index c8a7d167678..064595f3c43 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp @@ -119,15 +119,15 @@ void CSVBufferManager::ResetBuffer(const idx_t buffer_idx) { } } -idx_t CSVBufferManager::GetBufferSize() { +idx_t CSVBufferManager::GetBufferSize() const { return buffer_size; } -idx_t CSVBufferManager::BufferCount() { +idx_t CSVBufferManager::BufferCount() const { return cached_buffers.size(); } -bool CSVBufferManager::Done() { +bool CSVBufferManager::Done() const { return done; } @@ -144,7 +144,7 @@ void CSVBufferManager::ResetBufferManager() { } } -string CSVBufferManager::GetFilePath() { +string CSVBufferManager::GetFilePath() const { return file_path; } diff --git a/src/execution/operator/csv_scanner/scanner/base_scanner.cpp b/src/execution/operator/csv_scanner/scanner/base_scanner.cpp index 63e93edaee2..757598e140e 100644 --- a/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/base_scanner.cpp @@ -1,6 +1,6 @@ #include "duckdb/execution/operator/csv_scanner/base_scanner.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp" namespace duckdb { diff --git a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp 
b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index 5d6a9b0d2ee..1eb0b218974 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -60,14 +60,46 @@ bool CSVSchema::Empty() const { return columns.empty(); } -bool CSVSchema::SchemasMatch(string &error_message, vector &names, vector &types, - const string &cur_file_path) { - D_ASSERT(names.size() == types.size()); +bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_result, + const string &cur_file_path, bool is_minimal_sniffer) const{ + D_ASSERT(sniffer_result->names.size() == sniffer_result->return_types.size()); bool match = true; unordered_map current_schema; - for (idx_t i = 0; i < names.size(); i++) { + + for (idx_t i = 0; i < sniffer_result->names.size(); i++) { // Populate our little schema - current_schema[names[i]] = {types[i], i}; + current_schema[sniffer_result->names[i]] = {sniffer_result->return_types[i], i}; + } + if (is_minimal_sniffer) { + auto min_sniffer = static_cast(sniffer_result); + if (!min_sniffer->more_than_one_row) { + bool min_sniff_match = true; + // If we don't have more than one row, either the names must match or the types must match. + for (auto &column : columns) { + if (current_schema.find(column.name) == current_schema.end()) { + min_sniff_match = false; + break; + } + } + if (min_sniff_match) { + return true; + } + // Otherwise, the types must match. + min_sniff_match = true; + if (sniffer_result->return_types.size() == current_schema.size()) { + idx_t return_type_idx = 0; + for (auto &column : current_schema) { + if (column.second.type != sniffer_result->return_types[return_type_idx++]) { + min_sniff_match = false; + break; + } + } + } + if (min_sniff_match) { + return true; + } + } + // If we got to this point, the minimal sniffer doesn't match, we throw an error. 
} // Here we check if the schema of a given file matched our original schema // We consider it's not a match if: diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index baefcff45fa..935aa27136b 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -1,4 +1,4 @@ -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/common/types/value.hpp" namespace duckdb { @@ -88,15 +88,14 @@ void CSVSniffer::SetResultOptions() { options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header; } -SnifferResult CSVSniffer::MinimalSniff() { +AdaptiveSnifferResult CSVSniffer::MinimalSniff() { if (set_columns.IsSet()) { // Nothing to see here - return SnifferResult(*set_columns.types, *set_columns.names); + return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true); } // Return Types detected vector return_types; // Column Names detected - vector names; buffer_manager->sniffing = true; constexpr idx_t result_size = 2; @@ -106,7 +105,8 @@ SnifferResult CSVSniffer::MinimalSniff() { ColumnCountScanner count_scanner(buffer_manager, state_machine, error_handler, result_size); auto &sniffed_column_counts = count_scanner.ParseChunk(); if (sniffed_column_counts.result_position == 0) { - return {{}, {}}; + // The file is an empty file, we just return + return {{}, {}, false}; } state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns; @@ -130,20 +130,20 @@ SnifferResult CSVSniffer::MinimalSniff() { // Possibly Gather Header vector potential_header; - if (start_row != 0) { - for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) { - auto &cur_vector = data_chunk.data[col_idx]; - auto vector_data = 
FlatVector::GetData(cur_vector); - auto &validity = FlatVector::Validity(cur_vector); - HeaderValue val; - if (validity.RowIsValid(0)) { - val = HeaderValue(vector_data[0]); - } - potential_header.emplace_back(val); + + for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) { + auto &cur_vector = data_chunk.data[col_idx]; + auto vector_data = FlatVector::GetData(cur_vector); + auto &validity = FlatVector::Validity(cur_vector); + HeaderValue val; + if (validity.RowIsValid(0)) { + val = HeaderValue(vector_data[0]); } + potential_header.emplace_back(val); } - names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns, - best_sql_types_candidates_per_column_idx, options, *error_handler); + + vector names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns, + best_sql_types_candidates_per_column_idx, options, *error_handler); for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) { LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back(); @@ -153,10 +153,11 @@ SnifferResult CSVSniffer::MinimalSniff() { detected_types.push_back(d_type); } - return {detected_types, names}; + return {detected_types, names, sniffed_column_counts.result_position > 2}; } -SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) { + +SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { auto min_sniff_res = MinimalSniff(); bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors(); // Check if we are happy with the result or if we need to do more sniffing @@ -165,7 +166,7 @@ SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) { if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; run_full = - !file_schema.SchemasMatch(error, min_sniff_res.names, min_sniff_res.return_types, options.file_path); + !file_schema.SchemasMatch(error, &min_sniff_res, 
options.file_path, true); } } if (run_full) { @@ -173,14 +174,14 @@ SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) { auto full_sniffer = SniffCSV(); if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; - if (!file_schema.SchemasMatch(error, full_sniffer.names, full_sniffer.return_types, options.file_path) && + if (!file_schema.SchemasMatch(error, &full_sniffer, options.file_path, false) && !options.ignore_errors.GetValue()) { throw InvalidInputException(error); } } return full_sniffer; } - return min_sniff_res; + return static_cast(min_sniff_res); } SnifferResult CSVSniffer::SniffCSV(bool force_match) { buffer_manager->sniffing = true; diff --git a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index bf142a93010..c335f03ce74 100644 --- a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -1,5 +1,5 @@ #include "duckdb/common/shared_ptr.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/main/client_data.hpp" #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp" diff --git a/src/execution/operator/csv_scanner/sniffer/header_detection.cpp b/src/execution/operator/csv_scanner/sniffer/header_detection.cpp index cf42f56dd3a..9475f5942da 100644 --- a/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/header_detection.cpp @@ -1,5 +1,5 @@ #include "duckdb/common/types/cast_helpers.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp" #include "utf8proc.hpp" diff --git 
a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index df2a9540edd..89347630fae 100644 --- a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -4,7 +4,7 @@ #include "duckdb/common/operator/integer_cast_operator.hpp" #include "duckdb/common/string.hpp" #include "duckdb/common/types/time.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" namespace duckdb { struct TryCastFloatingOperator { diff --git a/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp b/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp index 43d69318307..8d3e268450c 100644 --- a/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +++ b/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp @@ -1,4 +1,4 @@ -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/csv_casting.hpp" namespace duckdb { diff --git a/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp b/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp index 34fa41469b3..a693144decc 100644 --- a/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +++ b/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp @@ -1,4 +1,4 @@ -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" namespace duckdb { void CSVSniffer::ReplaceTypes() { diff --git a/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp b/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp index 665c5b392f2..eae140f7d36 100644 --- 
a/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp +++ b/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp @@ -1,5 +1,5 @@ #include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "utf8proc_wrapper.hpp" #include "duckdb/main/error_manager.hpp" #include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp" diff --git a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp index 6c93cc939f3..929892c3b7b 100644 --- a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +++ b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp @@ -1,6 +1,6 @@ #include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp" #include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" namespace duckdb { diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index e35894861d3..3e457580c59 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -1,6 +1,6 @@ #include "duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp" #include "duckdb/function/table/read_csv.hpp" diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp 
b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 4f3e9dce824..cefb134138c 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -1,6 +1,6 @@ #include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/scanner_boundary.hpp" #include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp" #include "duckdb/execution/operator/persistent/csv_rejects_table.hpp" diff --git a/src/function/table/copy_csv.cpp b/src/function/table/copy_csv.cpp index b2c16a67172..26d04f89df7 100644 --- a/src/function/table/copy_csv.cpp +++ b/src/function/table/copy_csv.cpp @@ -7,7 +7,7 @@ #include "duckdb/common/types/column/column_data_collection.hpp" #include "duckdb/common/types/string_type.hpp" #include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/function/copy_function.hpp" #include "duckdb/function/scalar/string_functions.hpp" #include "duckdb/function/table/read_csv.hpp" diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 58948af7f03..02e571b0552 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -8,7 +8,7 @@ #include "duckdb/common/union_by_name.hpp" #include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp" #include "duckdb/execution/operator/csv_scanner/csv_error.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/persistent/csv_rejects_table.hpp" #include 
"duckdb/function/function_set.hpp" #include "duckdb/main/client_context.hpp" diff --git a/src/function/table/sniff_csv.cpp b/src/function/table/sniff_csv.cpp index 2a1d1b98437..ae38e5587a9 100644 --- a/src/function/table/sniff_csv.cpp +++ b/src/function/table/sniff_csv.cpp @@ -1,7 +1,7 @@ #include "duckdb/function/built_in_functions.hpp" #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp" #include "duckdb/common/types/data_chunk.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp" #include "duckdb/function/table_function.hpp" #include "duckdb/main/client_context.hpp" diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp index 396c60c9ba2..5bcf0af6511 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp @@ -35,15 +35,15 @@ class CSVBufferManager { void UnpinBuffer(const idx_t cache_idx); //! Returns the buffer size set for this CSV buffer manager - idx_t GetBufferSize(); + idx_t GetBufferSize() const; //! Returns the number of buffers in the cached_buffers cache - idx_t BufferCount(); + idx_t BufferCount() const; //! If this buffer manager is done. In the context of a buffer manager it means that it read all buffers at least //! once. 
- bool Done(); + bool Done() const; void ResetBufferManager(); - string GetFilePath(); + string GetFilePath() const; ClientContext &context; idx_t skip_rows = 0; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp index cf56f75f9ad..e2693517e2c 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp @@ -9,6 +9,7 @@ #pragma once #include "duckdb/common/types.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp" namespace duckdb { //! Basic CSV Column Info @@ -23,8 +24,8 @@ struct CSVColumnInfo { struct CSVSchema { void Initialize(vector &names, vector &types, const string &file_path); bool Empty() const; - bool SchemasMatch(string &error_message, vector &names, vector &types, - const string &cur_file_path); + bool SchemasMatch(string &error_message, SnifferResult *sniffer_resilt, + const string &cur_file_path, bool is_minimal_sniffer) const; private: static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination); diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp similarity index 95% rename from src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp rename to src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp index fa463788e36..6de097deaa1 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp @@ -1,7 +1,7 @@ //===----------------------------------------------------------------------===// // DuckDB // -// duckdb/execution/operator/csv_scanner/csv_sniffer.hpp +// duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp // // //===----------------------------------------------------------------------===// @@ 
-14,6 +14,7 @@ #include "duckdb/execution/operator/csv_scanner/column_count_scanner.hpp" #include "duckdb/execution/operator/csv_scanner/csv_schema.hpp" #include "duckdb/execution/operator/csv_scanner/header_value.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp" namespace duckdb { struct DateTimestampSniffing { @@ -22,16 +23,6 @@ struct DateTimestampSniffing { vector format; idx_t initial_size; }; -//! Struct to store the result of the Sniffer -struct SnifferResult { - SnifferResult(vector return_types_p, vector names_p) - : return_types(std::move(return_types_p)), names(std::move(names_p)) { - } - //! Return Types that were detected - vector return_types; - //! Column Names that were detected - vector names; -}; //! All the options that will be used to sniff the dialect of the CSV file struct DialectCandidates { @@ -134,10 +125,10 @@ class CSVSniffer { //! data types It does this considering a priorly set CSV schema. If there is a mismatch of the schema it runs the //! full on blazing all guns sniffer, if that still fails it tells the user to union_by_name. //! It returns the projection order. - SnifferResult AdaptiveSniff(CSVSchema &file_schema); + SnifferResult AdaptiveSniff(const CSVSchema &file_schema); //! 
Function that only sniffs the first two rows, to verify if a header exists and what are the data types - SnifferResult MinimalSniff(); + AdaptiveSnifferResult MinimalSniff(); static NewLineIdentifier DetectNewLineDelimiter(CSVBufferManager &buffer_manager); diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp new file mode 100644 index 00000000000..a31d510330b --- /dev/null +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp" +#include "duckdb/common/vector.hpp" + + +namespace duckdb { + +//! Struct to store the result of the Sniffer +struct SnifferResult { + SnifferResult(vector return_types_p, vector names_p) + : return_types(std::move(return_types_p)), names(std::move(names_p)) { + } + //! Return Types that were detected + vector return_types; + //! 
Column Names that were detected + vector names; +}; + +struct AdaptiveSnifferResult: SnifferResult { + AdaptiveSnifferResult(vector return_types_p, vector names_p, bool more_than_one_row_p) + : SnifferResult(std::move(return_types_p),std::move(names_p)), more_than_one_row(more_than_one_row_p) { + } + bool more_than_one_row; +}; +} \ No newline at end of file diff --git a/src/main/relation/read_csv_relation.cpp b/src/main/relation/read_csv_relation.cpp index a23091eee5a..56eb0d6f5cc 100644 --- a/src/main/relation/read_csv_relation.cpp +++ b/src/main/relation/read_csv_relation.cpp @@ -1,7 +1,7 @@ #include "duckdb/main/relation/read_csv_relation.hpp" #include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp" -#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp" +#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" #include "duckdb/parser/expression/columnref_expression.hpp" #include "duckdb/parser/expression/comparison_expression.hpp" #include "duckdb/parser/expression/constant_expression.hpp" diff --git a/test/sql/copy/csv/glob/read_csv_glob.test b/test/sql/copy/csv/glob/read_csv_glob.test index 8b7b693c42a..784bef427a2 100644 --- a/test/sql/copy/csv/glob/read_csv_glob.test +++ b/test/sql/copy/csv/glob/read_csv_glob.test @@ -5,6 +5,12 @@ statement ok PRAGMA enable_verification +query III +FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; +---- +2000-01-01 10 80.9189441112103 +2000-01-02 5 109.16581782022259 + # Globbing with different dialects query III FROM 'data/csv/glob_dif_dialect/f_*.csv' @@ -17,6 +23,11 @@ FROM 'data/csv/glob_dif_dialect/f_*.csv' 4 pedro pedro@email.com 5 tim tim@email.com +query III +FROM read_csv(['data/csv/glob_dif_dialect/f_1.csv', 'data/csv/glob_dif_dialect/empty.csv']) +---- + + # simple globbing query I SELECT * FROM read_csv('data/csv/glob/a?/*.csv') ORDER BY 1 From 7aef75b7a2b31a5329e40a9eaa96335d2360f8cb Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 30 Sep 2024 15:01:38 +0200 
Subject: [PATCH 02/20] More tests/fixes --- .../csv_scanner/scanner/csv_schema.cpp | 14 +++--- .../csv_scanner/sniffer/csv_sniffer.cpp | 6 +-- .../operator/csv_scanner/csv_schema.hpp | 4 +- .../csv_scanner/sniffer/sniff_result.hpp | 7 ++- test/sql/copy/csv/glob/read_csv_glob.test | 23 ---------- .../sql/copy/csv/glob/test_unmatch_globs.test | 24 ++++++++++ test/sql/copy/csv/test_mismatch_schemas.test | 44 +++++++++---------- 7 files changed, 61 insertions(+), 61 deletions(-) create mode 100644 test/sql/copy/csv/glob/test_unmatch_globs.test diff --git a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index 1eb0b218974..b2fe12f4ef9 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -60,8 +60,8 @@ bool CSVSchema::Empty() const { return columns.empty(); } -bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_result, - const string &cur_file_path, bool is_minimal_sniffer) const{ +bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_result, const string &cur_file_path, + bool is_minimal_sniffer) const { D_ASSERT(sniffer_result->names.size() == sniffer_result->return_types.size()); bool match = true; unordered_map current_schema; @@ -71,7 +71,7 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_resul current_schema[sniffer_result->names[i]] = {sniffer_result->return_types[i], i}; } if (is_minimal_sniffer) { - auto min_sniffer = static_cast(sniffer_result); + auto min_sniffer = static_cast(sniffer_result); if (!min_sniffer->more_than_one_row) { bool min_sniff_match = true; // If we don't have more than one row, either the names must match or the types must match. @@ -86,14 +86,16 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_resul } // Otherwise, the types must match. 
min_sniff_match = true; - if (sniffer_result->return_types.size() == current_schema.size()) { + if (sniffer_result->return_types.size() == columns.size()) { idx_t return_type_idx = 0; - for (auto &column : current_schema) { - if (column.second.type != sniffer_result->return_types[return_type_idx++]) { + for (auto &column : columns) { + if (column.type != sniffer_result->return_types[return_type_idx++]) { min_sniff_match = false; break; } } + } else { + min_sniff_match = false; } if (min_sniff_match) { return true; diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index 935aa27136b..a40c6724818 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -153,10 +153,9 @@ AdaptiveSnifferResult CSVSniffer::MinimalSniff() { detected_types.push_back(d_type); } - return {detected_types, names, sniffed_column_counts.result_position > 2}; + return {detected_types, names, sniffed_column_counts.result_position > 1}; } - SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { auto min_sniff_res = MinimalSniff(); bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors(); @@ -165,8 +164,7 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { // If we got no errors, we also run full if schemas do not match. 
if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; - run_full = - !file_schema.SchemasMatch(error, &min_sniff_res, options.file_path, true); + run_full = !file_schema.SchemasMatch(error, &min_sniff_res, options.file_path, true); } } if (run_full) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp index e2693517e2c..1e76c3a13f2 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp @@ -24,8 +24,8 @@ struct CSVColumnInfo { struct CSVSchema { void Initialize(vector &names, vector &types, const string &file_path); bool Empty() const; - bool SchemasMatch(string &error_message, SnifferResult *sniffer_resilt, - const string &cur_file_path, bool is_minimal_sniffer) const; + bool SchemasMatch(string &error_message, SnifferResult *sniffer_resilt, const string &cur_file_path, + bool is_minimal_sniffer) const; private: static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination); diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp index a31d510330b..f7f2fe6d260 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -11,7 +11,6 @@ #include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp" #include "duckdb/common/vector.hpp" - namespace duckdb { //! 
Struct to store the result of the Sniffer @@ -25,10 +24,10 @@ struct SnifferResult { vector names; }; -struct AdaptiveSnifferResult: SnifferResult { +struct AdaptiveSnifferResult : SnifferResult { AdaptiveSnifferResult(vector return_types_p, vector names_p, bool more_than_one_row_p) - : SnifferResult(std::move(return_types_p),std::move(names_p)), more_than_one_row(more_than_one_row_p) { + : SnifferResult(std::move(return_types_p), std::move(names_p)), more_than_one_row(more_than_one_row_p) { } bool more_than_one_row; }; -} \ No newline at end of file +} // namespace duckdb diff --git a/test/sql/copy/csv/glob/read_csv_glob.test b/test/sql/copy/csv/glob/read_csv_glob.test index 784bef427a2..c69b64c67d4 100644 --- a/test/sql/copy/csv/glob/read_csv_glob.test +++ b/test/sql/copy/csv/glob/read_csv_glob.test @@ -5,29 +5,6 @@ statement ok PRAGMA enable_verification -query III -FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; ----- -2000-01-01 10 80.9189441112103 -2000-01-02 5 109.16581782022259 - -# Globbing with different dialects -query III -FROM 'data/csv/glob_dif_dialect/f_*.csv' ----- -1 alice alice@email.com -2 eve eve@email.com -3r bob NULL -1 alice alice@email.com -3 bob bob@email.com -4 pedro pedro@email.com -5 tim tim@email.com - -query III -FROM read_csv(['data/csv/glob_dif_dialect/f_1.csv', 'data/csv/glob_dif_dialect/empty.csv']) ----- - - # simple globbing query I SELECT * FROM read_csv('data/csv/glob/a?/*.csv') ORDER BY 1 diff --git a/test/sql/copy/csv/glob/test_unmatch_globs.test b/test/sql/copy/csv/glob/test_unmatch_globs.test new file mode 100644 index 00000000000..8770f6d39ec --- /dev/null +++ b/test/sql/copy/csv/glob/test_unmatch_globs.test @@ -0,0 +1,24 @@ +# name: test/sql/copy/csv/glob/test_unmatch_globs.test +# description: Test globbing CSVs +# group: [glob] + +statement ok +PRAGMA enable_verification + +#query III +#FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; +#---- +#2000-01-01 10 80.9189441112103 +#2000-01-02 5 109.16581782022259 + +# 
Globbing with different dialects +query III +FROM 'data/csv/glob_dif_dialect/f_*.csv' +---- +1 alice alice@email.com +2 eve eve@email.com +3r bob NULL +1 alice alice@email.com +3 bob bob@email.com +4 pedro pedro@email.com +5 tim tim@email.com diff --git a/test/sql/copy/csv/test_mismatch_schemas.test b/test/sql/copy/csv/test_mismatch_schemas.test index 57b73edc634..572a297acb8 100644 --- a/test/sql/copy/csv/test_mismatch_schemas.test +++ b/test/sql/copy/csv/test_mismatch_schemas.test @@ -5,28 +5,28 @@ statement ok PRAGMA enable_verification -# We will use the header from the first file (the one identified in the binder) as our schema. If other files have extra columns, these columns will be ignored -query III -SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', -'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_4.csv']) -ORDER BY ALL; ----- -1 2 3 -1 2 3 -1 2 3 -1 2 3 -1 2 3 -1 2 3 -1 2 3 -1 2 3 - -# If there is a mismatch, all files must have a header, or error. -statement error -SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', -'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_no_header.csv']) -ORDER BY ALL; ----- -Current file: data/csv/multiple_files/more_columns/file_no_header.csv +## We will use the header from the first file (the one identified in the binder) as our schema. 
If other files have extra columns, these columns will be ignored +#query III +#SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', +#'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_4.csv']) +#ORDER BY ALL; +#---- +#1 2 3 +#1 2 3 +#1 2 3 +#1 2 3 +#1 2 3 +#1 2 3 +#1 2 3 +#1 2 3 +# +## If there is a mismatch, all files must have a header, or error. +#statement error +#SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', +#'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_no_header.csv']) +#ORDER BY ALL; +#---- +#Current file: data/csv/multiple_files/more_columns/file_no_header.csv # What if we add file 5, has 3 columns but with a name mismatch statement error From efc2b0f80eadc6a0bf497d384e985b499a378a8c Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 30 Sep 2024 15:03:50 +0200 Subject: [PATCH 03/20] fix tests --- .../sql/copy/csv/glob/test_unmatch_globs.test | 10 ++--- test/sql/copy/csv/test_mismatch_schemas.test | 44 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/test/sql/copy/csv/glob/test_unmatch_globs.test b/test/sql/copy/csv/glob/test_unmatch_globs.test index 8770f6d39ec..f267e00093e 100644 --- a/test/sql/copy/csv/glob/test_unmatch_globs.test +++ b/test/sql/copy/csv/glob/test_unmatch_globs.test @@ -5,11 +5,11 @@ statement ok PRAGMA enable_verification -#query III -#FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; -#---- -#2000-01-01 10 80.9189441112103 -#2000-01-02 5 109.16581782022259 +query III +FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; +---- +2000-01-01 10 80.9189441112103 +2000-01-02 5 109.16581782022259 # Globbing with different dialects query III diff --git a/test/sql/copy/csv/test_mismatch_schemas.test b/test/sql/copy/csv/test_mismatch_schemas.test index 
572a297acb8..57b73edc634 100644 --- a/test/sql/copy/csv/test_mismatch_schemas.test +++ b/test/sql/copy/csv/test_mismatch_schemas.test @@ -5,28 +5,28 @@ statement ok PRAGMA enable_verification -## We will use the header from the first file (the one identified in the binder) as our schema. If other files have extra columns, these columns will be ignored -#query III -#SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', -#'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_4.csv']) -#ORDER BY ALL; -#---- -#1 2 3 -#1 2 3 -#1 2 3 -#1 2 3 -#1 2 3 -#1 2 3 -#1 2 3 -#1 2 3 -# -## If there is a mismatch, all files must have a header, or error. -#statement error -#SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', -#'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_no_header.csv']) -#ORDER BY ALL; -#---- -#Current file: data/csv/multiple_files/more_columns/file_no_header.csv +# We will use the header from the first file (the one identified in the binder) as our schema. If other files have extra columns, these columns will be ignored +query III +SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', +'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_4.csv']) +ORDER BY ALL; +---- +1 2 3 +1 2 3 +1 2 3 +1 2 3 +1 2 3 +1 2 3 +1 2 3 +1 2 3 + +# If there is a mismatch, all files must have a header, or error. 
+statement error +SELECT * FROM read_csv(['data/csv/multiple_files/more_columns/file_1.csv','data/csv/multiple_files/more_columns/file_2.csv', +'data/csv/multiple_files/more_columns/file_3.csv','data/csv/multiple_files/more_columns/file_no_header.csv']) +ORDER BY ALL; +---- +Current file: data/csv/multiple_files/more_columns/file_no_header.csv # What if we add file 5, has 3 columns but with a name mismatch statement error From 124d68becffe0d437ddd2e969726e7392fd015fa Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 30 Sep 2024 15:27:46 +0200 Subject: [PATCH 04/20] Adding a test with a single type line --- data/csv/glob_dif_dialect/14166/empty.csv | 0 data/csv/glob_dif_dialect/14166/matching_types.csv | 1 + .../operator/csv_scanner/scanner/csv_schema.cpp | 5 +++++ test/sql/copy/csv/glob/test_unmatch_globs.test | 12 ++++++++++++ 4 files changed, 18 insertions(+) create mode 100644 data/csv/glob_dif_dialect/14166/empty.csv create mode 100644 data/csv/glob_dif_dialect/14166/matching_types.csv diff --git a/data/csv/glob_dif_dialect/14166/empty.csv b/data/csv/glob_dif_dialect/14166/empty.csv new file mode 100644 index 00000000000..e69de29bb2d diff --git a/data/csv/glob_dif_dialect/14166/matching_types.csv b/data/csv/glob_dif_dialect/14166/matching_types.csv new file mode 100644 index 00000000000..87f3c093dad --- /dev/null +++ b/data/csv/glob_dif_dialect/14166/matching_types.csv @@ -0,0 +1 @@ +2003-01-02,5,109.16581782022259 \ No newline at end of file diff --git a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index b2fe12f4ef9..b1e9399eeb5 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -98,6 +98,11 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_resul min_sniff_match = false; } if (min_sniff_match) { + // If we got here, we have the right types but the wrong names, lets fix 
the names + idx_t sniff_name_idx = 0; + for (auto &column : columns) { + sniffer_result->names[sniff_name_idx++] = column.name; + } return true; } } diff --git a/test/sql/copy/csv/glob/test_unmatch_globs.test b/test/sql/copy/csv/glob/test_unmatch_globs.test index f267e00093e..3b9e2c0761a 100644 --- a/test/sql/copy/csv/glob/test_unmatch_globs.test +++ b/test/sql/copy/csv/glob/test_unmatch_globs.test @@ -11,6 +11,18 @@ FROM 'data/csv/glob_dif_dialect/14166/__200*.csv'; 2000-01-01 10 80.9189441112103 2000-01-02 5 109.16581782022259 +query III +FROM read_csv(['data/csv/glob_dif_dialect/14166/__2000.csv', 'data/csv/glob_dif_dialect/14166/__2001.csv', 'data/csv/glob_dif_dialect/14166/empty.csv']); +---- +2000-01-01 10 80.9189441112103 +2000-01-02 5 109.16581782022259 + +query III +FROM read_csv(['data/csv/glob_dif_dialect/14166/__2000.csv','data/csv/glob_dif_dialect/14166/matching_types.csv']); +---- +2000-01-01 10 80.9189441112103 +2000-01-02 5 109.16581782022259 + # Globbing with different dialects query III FROM 'data/csv/glob_dif_dialect/f_*.csv' From 7894773caa72a33a0d227f2abb7c7861b4860c11 Mon Sep 17 00:00:00 2001 From: pdet Date: Mon, 30 Sep 2024 16:37:45 +0200 Subject: [PATCH 05/20] avoid slicing --- src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp | 2 +- .../execution/operator/csv_scanner/sniffer/sniff_result.hpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index a40c6724818..d3017f111a5 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -179,7 +179,7 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { } return full_sniffer; } - return static_cast(min_sniff_res); + return min_sniff_res.ToSnifferResult(); } SnifferResult CSVSniffer::SniffCSV(bool force_match) { buffer_manager->sniffing = true; diff 
--git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp index f7f2fe6d260..4d3b18932b8 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -29,5 +29,8 @@ struct AdaptiveSnifferResult : SnifferResult { : SnifferResult(std::move(return_types_p), std::move(names_p)), more_than_one_row(more_than_one_row_p) { } bool more_than_one_row; + SnifferResult ToSnifferResult() { + return {return_types,names}; + } }; } // namespace duckdb From c4f27cca3b259a0509ac771f2c7ed58916d021f7 Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 2 Oct 2024 14:26:12 +0200 Subject: [PATCH 06/20] Format --- .../execution/operator/csv_scanner/sniffer/sniff_result.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp index 4d3b18932b8..2a8e0de19fa 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -30,7 +30,7 @@ struct AdaptiveSnifferResult : SnifferResult { } bool more_than_one_row; SnifferResult ToSnifferResult() { - return {return_types,names}; + return {return_types, names}; } }; } // namespace duckdb From 53fec90563acd99dc27fe51e2ae50f9e304f4da7 Mon Sep 17 00:00:00 2001 From: pdet Date: Thu, 10 Oct 2024 12:19:38 +0200 Subject: [PATCH 07/20] Remove raw ptr --- .../csv_scanner/scanner/csv_schema.cpp | 10 +++--- .../csv_scanner/sniffer/csv_sniffer.cpp | 33 +++++++++++++------ .../operator/csv_scanner/csv_schema.hpp | 2 +- .../csv_scanner/sniffer/csv_sniffer.hpp | 5 +-- .../csv_scanner/sniffer/sniff_result.hpp | 3 +- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git 
a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index b1e9399eeb5..fc6115cdc5f 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -60,8 +60,10 @@ bool CSVSchema::Empty() const { return columns.empty(); } -bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_result, const string &cur_file_path, - bool is_minimal_sniffer) const { +bool CSVSchema::SchemasMatch(string &error_message, weak_ptr sniffer_result_ptr, + const string &cur_file_path, bool is_minimal_sniffer) const { + auto sniffer_result = sniffer_result_ptr.lock(); + D_ASSERT(sniffer_result); D_ASSERT(sniffer_result->names.size() == sniffer_result->return_types.size()); bool match = true; unordered_map current_schema; @@ -71,8 +73,8 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult *sniffer_resul current_schema[sniffer_result->names[i]] = {sniffer_result->return_types[i], i}; } if (is_minimal_sniffer) { - auto min_sniffer = static_cast(sniffer_result); - if (!min_sniffer->more_than_one_row) { + auto min_sniffer = static_cast(sniffer_result.get()); + if (min_sniffer && !min_sniffer->more_than_one_row) { bool min_sniff_match = true; // If we don't have more than one row, either the names must match or the types must match. 
for (auto &column : columns) { diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index d3017f111a5..f25d8675bde 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -88,10 +88,10 @@ void CSVSniffer::SetResultOptions() { options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header; } -AdaptiveSnifferResult CSVSniffer::MinimalSniff() { +shared_ptr CSVSniffer::MinimalSniff() { if (set_columns.IsSet()) { // Nothing to see here - return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true); + return make_shared_ptr(*set_columns.types, *set_columns.names, true); } // Return Types detected vector return_types; @@ -106,7 +106,7 @@ AdaptiveSnifferResult CSVSniffer::MinimalSniff() { auto &sniffed_column_counts = count_scanner.ParseChunk(); if (sniffed_column_counts.result_position == 0) { // The file is an empty file, we just return - return {{}, {}, false}; + return make_shared_ptr(); } state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns; @@ -153,7 +153,7 @@ AdaptiveSnifferResult CSVSniffer::MinimalSniff() { detected_types.push_back(d_type); } - return {detected_types, names, sniffed_column_counts.result_position > 1}; + return make_shared_ptr(detected_types, names, sniffed_column_counts.result_position > 1); } SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { @@ -164,24 +164,25 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { // If we got no errors, we also run full if schemas do not match. 
if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; - run_full = !file_schema.SchemasMatch(error, &min_sniff_res, options.file_path, true); + run_full = !file_schema.SchemasMatch(error, min_sniff_res, options.file_path, true); } } if (run_full) { // We run full sniffer - auto full_sniffer = SniffCSV(); + auto full_sniffer = SniffCSVShared(); if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; - if (!file_schema.SchemasMatch(error, &full_sniffer, options.file_path, false) && + if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) && !options.ignore_errors.GetValue()) { throw InvalidInputException(error); } } - return full_sniffer; + return *full_sniffer; } - return min_sniff_res.ToSnifferResult(); + return min_sniff_res->ToSnifferResult(); } -SnifferResult CSVSniffer::SniffCSV(bool force_match) { + +void CSVSniffer::SniffCSVInternal(bool force_match) { buffer_manager->sniffing = true; // 1. Dialect Detection DetectDialect(); @@ -266,10 +267,22 @@ SnifferResult CSVSniffer::SniffCSV(bool force_match) { throw InvalidInputException(error); } options.was_type_manually_set = manually_set; +} + +SnifferResult CSVSniffer::SniffCSV(bool force_match) { + SniffCSVInternal(force_match); if (set_columns.IsSet()) { return SnifferResult(*set_columns.types, *set_columns.names); } return SnifferResult(detected_types, names); } +shared_ptr CSVSniffer::SniffCSVShared(bool force_match) { + SniffCSVInternal(force_match); + if (set_columns.IsSet()) { + return make_shared_ptr(*set_columns.types, *set_columns.names); + } + return make_shared_ptr(detected_types, names); +} + } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp index 1e76c3a13f2..334ba6e41eb 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp @@ 
-24,7 +24,7 @@ struct CSVColumnInfo { struct CSVSchema { void Initialize(vector &names, vector &types, const string &file_path); bool Empty() const; - bool SchemasMatch(string &error_message, SnifferResult *sniffer_resilt, const string &cur_file_path, + bool SchemasMatch(string &error_message, weak_ptr sniffer_result, const string &cur_file_path, bool is_minimal_sniffer) const; private: diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp index 6de097deaa1..0b9aaa459b4 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp @@ -118,8 +118,9 @@ class CSVSniffer { //! 3. Type Refinement: Refines the types of the columns for the remaining chunks //! 4. Header Detection: Figures out if the CSV file has a header and produces the names of the columns //! 5. Type Replacement: Replaces the types of the columns if the user specified them + void SniffCSVInternal(bool force_match); + shared_ptr SniffCSVShared(bool force_match = false); SnifferResult SniffCSV(bool force_match = false); - //! I call it adaptive, since that's a sexier term. //! In practice this Function that only sniffs the first two rows, to verify if a header exists and what are the //! data types It does this considering a priorly set CSV schema. If there is a mismatch of the schema it runs the @@ -128,7 +129,7 @@ class CSVSniffer { SnifferResult AdaptiveSniff(const CSVSchema &file_schema); //! 
Function that only sniffs the first two rows, to verify if a header exists and what are the data types - AdaptiveSnifferResult MinimalSniff(); + shared_ptr MinimalSniff(); static NewLineIdentifier DetectNewLineDelimiter(CSVBufferManager &buffer_manager); diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp index 2a8e0de19fa..405b34f69f1 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -25,7 +25,8 @@ struct SnifferResult { }; struct AdaptiveSnifferResult : SnifferResult { - AdaptiveSnifferResult(vector return_types_p, vector names_p, bool more_than_one_row_p) + explicit AdaptiveSnifferResult(vector return_types_p = {}, vector names_p = {}, + bool more_than_one_row_p = false) : SnifferResult(std::move(return_types_p), std::move(names_p)), more_than_one_row(more_than_one_row_p) { } bool more_than_one_row; From e7759b9c12b8b7350367887cee61177478ad7fb3 Mon Sep 17 00:00:00 2001 From: pdet Date: Thu, 10 Oct 2024 17:22:31 +0200 Subject: [PATCH 08/20] clang --- src/execution/operator/csv_scanner/scanner/csv_schema.cpp | 2 +- .../duckdb/execution/operator/csv_scanner/csv_schema.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index fc6115cdc5f..52ddd07dbdc 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -60,7 +60,7 @@ bool CSVSchema::Empty() const { return columns.empty(); } -bool CSVSchema::SchemasMatch(string &error_message, weak_ptr sniffer_result_ptr, +bool CSVSchema::SchemasMatch(string &error_message, const weak_ptr &sniffer_result_ptr, const string &cur_file_path, bool is_minimal_sniffer) const { auto 
sniffer_result = sniffer_result_ptr.lock(); D_ASSERT(sniffer_result); diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp index 334ba6e41eb..e351cbbec9e 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp @@ -24,7 +24,7 @@ struct CSVColumnInfo { struct CSVSchema { void Initialize(vector &names, vector &types, const string &file_path); bool Empty() const; - bool SchemasMatch(string &error_message, weak_ptr sniffer_result, const string &cur_file_path, + bool SchemasMatch(string &error_message, const weak_ptr &sniffer_result, const string &cur_file_path, bool is_minimal_sniffer) const; private: From 46a0f292081670f1733c0b3b6cf4aa7d3ace2fa4 Mon Sep 17 00:00:00 2001 From: pdet Date: Fri, 11 Oct 2024 16:57:01 +0200 Subject: [PATCH 09/20] Referenec --- .../csv_scanner/scanner/csv_schema.cpp | 22 +++++++------- .../csv_scanner/sniffer/csv_sniffer.cpp | 29 +++++-------------- .../operator/csv_scanner/csv_schema.hpp | 2 +- .../csv_scanner/sniffer/csv_sniffer.hpp | 5 ++-- .../csv_scanner/sniffer/sniff_result.hpp | 3 +- 5 files changed, 22 insertions(+), 39 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp index 52ddd07dbdc..139398d7180 100644 --- a/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +++ b/src/execution/operator/csv_scanner/scanner/csv_schema.cpp @@ -60,21 +60,19 @@ bool CSVSchema::Empty() const { return columns.empty(); } -bool CSVSchema::SchemasMatch(string &error_message, const weak_ptr &sniffer_result_ptr, - const string &cur_file_path, bool is_minimal_sniffer) const { - auto sniffer_result = sniffer_result_ptr.lock(); - D_ASSERT(sniffer_result); - D_ASSERT(sniffer_result->names.size() == sniffer_result->return_types.size()); +bool CSVSchema::SchemasMatch(string 
&error_message, SnifferResult &sniffer_result, const string &cur_file_path, + bool is_minimal_sniffer) const { + D_ASSERT(sniffer_result.names.size() == sniffer_result.return_types.size()); bool match = true; unordered_map current_schema; - for (idx_t i = 0; i < sniffer_result->names.size(); i++) { + for (idx_t i = 0; i < sniffer_result.names.size(); i++) { // Populate our little schema - current_schema[sniffer_result->names[i]] = {sniffer_result->return_types[i], i}; + current_schema[sniffer_result.names[i]] = {sniffer_result.return_types[i], i}; } if (is_minimal_sniffer) { - auto min_sniffer = static_cast(sniffer_result.get()); - if (min_sniffer && !min_sniffer->more_than_one_row) { + auto min_sniffer = static_cast(sniffer_result); + if (!min_sniffer.more_than_one_row) { bool min_sniff_match = true; // If we don't have more than one row, either the names must match or the types must match. for (auto &column : columns) { @@ -88,10 +86,10 @@ bool CSVSchema::SchemasMatch(string &error_message, const weak_ptrreturn_types.size() == columns.size()) { + if (sniffer_result.return_types.size() == columns.size()) { idx_t return_type_idx = 0; for (auto &column : columns) { - if (column.type != sniffer_result->return_types[return_type_idx++]) { + if (column.type != sniffer_result.return_types[return_type_idx++]) { min_sniff_match = false; break; } @@ -103,7 +101,7 @@ bool CSVSchema::SchemasMatch(string &error_message, const weak_ptrnames[sniff_name_idx++] = column.name; + sniffer_result.names[sniff_name_idx++] = column.name; } return true; } diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index f25d8675bde..950d74891b0 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -88,10 +88,10 @@ void CSVSniffer::SetResultOptions() { options.dialect_options.rows_until_header = 
best_candidate->GetStateMachine().dialect_options.rows_until_header; } -shared_ptr CSVSniffer::MinimalSniff() { +AdaptiveSnifferResult CSVSniffer::MinimalSniff() { if (set_columns.IsSet()) { // Nothing to see here - return make_shared_ptr(*set_columns.types, *set_columns.names, true); + return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true); } // Return Types detected vector return_types; @@ -106,7 +106,7 @@ shared_ptr CSVSniffer::MinimalSniff() { auto &sniffed_column_counts = count_scanner.ParseChunk(); if (sniffed_column_counts.result_position == 0) { // The file is an empty file, we just return - return make_shared_ptr(); + return {{}, {}, false}; } state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns; @@ -153,7 +153,7 @@ shared_ptr CSVSniffer::MinimalSniff() { detected_types.push_back(d_type); } - return make_shared_ptr(detected_types, names, sniffed_column_counts.result_position > 1); + return {detected_types, names, sniffed_column_counts.result_position > 1}; } SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { @@ -169,7 +169,7 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { } if (run_full) { // We run full sniffer - auto full_sniffer = SniffCSVShared(); + auto full_sniffer = SniffCSV(); if (!set_columns.IsSet() && !options.file_options.AnySet()) { string error; if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) && @@ -177,12 +177,11 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { throw InvalidInputException(error); } } - return *full_sniffer; + return full_sniffer; } - return min_sniff_res->ToSnifferResult(); + return min_sniff_res.ToSnifferResult(); } - -void CSVSniffer::SniffCSVInternal(bool force_match) { +SnifferResult CSVSniffer::SniffCSV(bool force_match) { buffer_manager->sniffing = true; // 1. 
Dialect Detection DetectDialect(); @@ -267,22 +266,10 @@ void CSVSniffer::SniffCSVInternal(bool force_match) { throw InvalidInputException(error); } options.was_type_manually_set = manually_set; -} - -SnifferResult CSVSniffer::SniffCSV(bool force_match) { - SniffCSVInternal(force_match); if (set_columns.IsSet()) { return SnifferResult(*set_columns.types, *set_columns.names); } return SnifferResult(detected_types, names); } -shared_ptr CSVSniffer::SniffCSVShared(bool force_match) { - SniffCSVInternal(force_match); - if (set_columns.IsSet()) { - return make_shared_ptr(*set_columns.types, *set_columns.names); - } - return make_shared_ptr(detected_types, names); -} - } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp index e351cbbec9e..39e1b809a9c 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp @@ -24,7 +24,7 @@ struct CSVColumnInfo { struct CSVSchema { void Initialize(vector &names, vector &types, const string &file_path); bool Empty() const; - bool SchemasMatch(string &error_message, const weak_ptr &sniffer_result, const string &cur_file_path, + bool SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path, bool is_minimal_sniffer) const; private: diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp index 0b9aaa459b4..6de097deaa1 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp @@ -118,9 +118,8 @@ class CSVSniffer { //! 3. Type Refinement: Refines the types of the columns for the remaining chunks //! 4. 
Header Detection: Figures out if the CSV file has a header and produces the names of the columns //! 5. Type Replacement: Replaces the types of the columns if the user specified them - void SniffCSVInternal(bool force_match); - shared_ptr SniffCSVShared(bool force_match = false); SnifferResult SniffCSV(bool force_match = false); + //! I call it adaptive, since that's a sexier term. //! In practice this Function that only sniffs the first two rows, to verify if a header exists and what are the //! data types It does this considering a priorly set CSV schema. If there is a mismatch of the schema it runs the @@ -129,7 +128,7 @@ class CSVSniffer { SnifferResult AdaptiveSniff(const CSVSchema &file_schema); //! Function that only sniffs the first two rows, to verify if a header exists and what are the data types - shared_ptr MinimalSniff(); + AdaptiveSnifferResult MinimalSniff(); static NewLineIdentifier DetectNewLineDelimiter(CSVBufferManager &buffer_manager); diff --git a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp index 405b34f69f1..2a8e0de19fa 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp @@ -25,8 +25,7 @@ struct SnifferResult { }; struct AdaptiveSnifferResult : SnifferResult { - explicit AdaptiveSnifferResult(vector return_types_p = {}, vector names_p = {}, - bool more_than_one_row_p = false) + AdaptiveSnifferResult(vector return_types_p, vector names_p, bool more_than_one_row_p) : SnifferResult(std::move(return_types_p), std::move(names_p)), more_than_one_row(more_than_one_row_p) { } bool more_than_one_row; From cf510e0eccfdede3abd77d064f3cd689a84470f6 Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:47:54 -0700 Subject: [PATCH 10/20] Internal #3251: DateDiff Across Epoch 
Integer division in C++ rounds towards 0, not down. This was impacting all resolutions hour and below. fixes: duckdblabs/duckdb-internal#3251 --- src/core_functions/scalar/date/date_diff.cpp | 18 +++-- .../timestamp/test_date_diff_epoch.test | 66 +++++++++++++++++++ 2 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 test/sql/function/timestamp/test_date_diff_epoch.test diff --git a/src/core_functions/scalar/date/date_diff.cpp b/src/core_functions/scalar/date/date_diff.cpp index 6266dda3503..36376a2b571 100644 --- a/src/core_functions/scalar/date/date_diff.cpp +++ b/src/core_functions/scalar/date/date_diff.cpp @@ -28,6 +28,14 @@ struct DateDiff { }); } + // We need to truncate down, not towards 0 + static inline int64_t Truncate(int64_t value, int64_t units) { + return (value + (value < 0)) / units - (value < 0); + } + static inline int64_t Diff(int64_t start, int64_t end, int64_t units) { + return Truncate(end, units) - Truncate(start, units); + } + struct YearOperator { template static inline TR Operation(TA startdate, TB enddate) { @@ -204,30 +212,28 @@ template <> int64_t DateDiff::MillisecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) { D_ASSERT(Timestamp::IsFinite(startdate)); D_ASSERT(Timestamp::IsFinite(enddate)); - return Timestamp::GetEpochMs(enddate) - Timestamp::GetEpochMs(startdate); + return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MSEC); } template <> int64_t DateDiff::SecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) { D_ASSERT(Timestamp::IsFinite(startdate)); D_ASSERT(Timestamp::IsFinite(enddate)); - return Timestamp::GetEpochSeconds(enddate) - Timestamp::GetEpochSeconds(startdate); + return Diff(startdate.value, enddate.value, Interval::MICROS_PER_SEC); } template <> int64_t DateDiff::MinutesOperator::Operation(timestamp_t startdate, timestamp_t enddate) { D_ASSERT(Timestamp::IsFinite(startdate)); D_ASSERT(Timestamp::IsFinite(enddate)); - return 
Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_MINUTE - - Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_MINUTE; + return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MINUTE); } template <> int64_t DateDiff::HoursOperator::Operation(timestamp_t startdate, timestamp_t enddate) { D_ASSERT(Timestamp::IsFinite(startdate)); D_ASSERT(Timestamp::IsFinite(enddate)); - return Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_HOUR - - Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_HOUR; + return Diff(startdate.value, enddate.value, Interval::MICROS_PER_HOUR); } // TIME specialisations diff --git a/test/sql/function/timestamp/test_date_diff_epoch.test b/test/sql/function/timestamp/test_date_diff_epoch.test new file mode 100644 index 00000000000..4402462e3be --- /dev/null +++ b/test/sql/function/timestamp/test_date_diff_epoch.test @@ -0,0 +1,66 @@ +# name: test/sql/function/timestamp/test_date_diff_epoch.test +# description: Test equal period timestamp diffs across the Unix epoch +# group: [timestamp] + +statement ok +PRAGMA enable_verification + +query III +SELECT + start_ts, + end_ts, + DATEDIFF('day', start_ts, end_ts) AS dd_hour +FROM VALUES ( + '1970-01-03 12:12:12'::TIMESTAMP, + '1969-12-25 05:05:05'::TIMESTAMP + ) x(start_ts, end_ts); +---- +1970-01-03 12:12:12 1969-12-25 05:05:05 -9 + +query III +SELECT + start_ts, + end_ts, + DATEDIFF('hour', start_ts, end_ts) AS dd_hour +FROM VALUES ( + '1970-01-01 12:12:12'::TIMESTAMP, + '1969-12-31 05:05:05'::TIMESTAMP + ) x(start_ts, end_ts); +---- +1970-01-01 12:12:12 1969-12-31 05:05:05 -31 + +query III +SELECT + start_ts, + end_ts, + DATEDIFF('minute', start_ts, end_ts) AS dd_minute +FROM VALUES ( + '1970-01-01 00:12:12'::TIMESTAMP, + '1969-12-31 23:05:05'::TIMESTAMP + ) x(start_ts, end_ts); +---- +1970-01-01 00:12:12 1969-12-31 23:05:05 -67 + +query III +SELECT + start_ts, + end_ts, + DATEDIFF('second', start_ts, end_ts) AS dd_second +FROM VALUES ( + '1970-01-01 
00:00:12.456'::TIMESTAMP, + '1969-12-31 23:59:05.123'::TIMESTAMP + ) x(start_ts, end_ts); +---- +1970-01-01 00:00:12.456 1969-12-31 23:59:05.123 -67 + +query III +SELECT + start_ts, + end_ts, + DATEDIFF('millisecond', start_ts, end_ts) AS dd_second +FROM VALUES ( + '1970-01-01 00:00:12.456789'::TIMESTAMP, + '1969-12-31 23:59:05.123456'::TIMESTAMP + ) x(start_ts, end_ts); +---- +1970-01-01 00:00:12.456789 1969-12-31 23:59:05.123456 -67333 From 00848e12a928864f2c8e700594269472ba4d704e Mon Sep 17 00:00:00 2001 From: Tishj Date: Sat, 12 Oct 2024 19:24:43 +0200 Subject: [PATCH 11/20] add test with generated columns --- src/include/duckdb/main/table_description.hpp | 14 +++++ src/main/appender.cpp | 3 + src/main/client_context.cpp | 10 ++- test/appender/test_appender.cpp | 63 +++++++++++++++++++ 4 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/include/duckdb/main/table_description.hpp b/src/include/duckdb/main/table_description.hpp index 2a35b4f1669..151592f4935 100644 --- a/src/include/duckdb/main/table_description.hpp +++ b/src/include/duckdb/main/table_description.hpp @@ -13,12 +13,26 @@ namespace duckdb { struct TableDescription { +public: //! The schema of the table string schema; //! The table name of the table string table; //! The columns of the table vector columns; + +public: + idx_t PhysicalColumnCount() const { + idx_t count = 0; + for (auto &column : columns) { + if (column.Generated()) { + continue; + } + count++; + } + D_ASSERT(count != 0); + return count; + } }; } // namespace duckdb diff --git a/src/main/appender.cpp b/src/main/appender.cpp index ae360852414..33ef17bf444 100644 --- a/src/main/appender.cpp +++ b/src/main/appender.cpp @@ -62,6 +62,9 @@ Appender::Appender(Connection &con, const string &schema_name, const string &tab } vector> defaults; for (auto &column : description->columns) { + if (column.Generated()) { + continue; + } types.push_back(column.Type()); defaults.push_back(column.HasDefaultValue() ? 
&column.DefaultValue() : nullptr); } diff --git a/src/main/client_context.cpp b/src/main/client_context.cpp index 467f28dacde..90c029ab498 100644 --- a/src/main/client_context.cpp +++ b/src/main/client_context.cpp @@ -1132,13 +1132,19 @@ void ClientContext::Append(TableDescription &description, ColumnDataCollection & auto &table_entry = Catalog::GetEntry(*this, INVALID_CATALOG, description.schema, description.table); // verify that the table columns and types match up - if (description.columns.size() != table_entry.GetColumns().PhysicalColumnCount()) { + if (description.PhysicalColumnCount() != table_entry.GetColumns().PhysicalColumnCount()) { throw InvalidInputException("Failed to append: table entry has different number of columns!"); } + idx_t table_entry_col_idx = 0; for (idx_t i = 0; i < description.columns.size(); i++) { - if (description.columns[i].Type() != table_entry.GetColumns().GetColumn(PhysicalIndex(i)).Type()) { + auto &column = description.columns[i]; + if (column.Generated()) { + continue; + } + if (column.Type() != table_entry.GetColumns().GetColumn(PhysicalIndex(table_entry_col_idx)).Type()) { throw InvalidInputException("Failed to append: table entry has different number of columns!"); } + table_entry_col_idx++; } auto binder = Binder::CreateBinder(*this); auto bound_constraints = binder->BindConstraints(table_entry); diff --git a/test/appender/test_appender.cpp b/test/appender/test_appender.cpp index bb659982e2c..e20d498c883 100644 --- a/test/appender/test_appender.cpp +++ b/test/appender/test_appender.cpp @@ -169,6 +169,69 @@ TEST_CASE("Test AppendRow", "[appender]") { REQUIRE(CHECK_COLUMN(result, 2, {Value::TIMESTAMP(1992, 1, 1, 1, 1, 1, 0)})); } +TEST_CASE("Test appender with generated column", "[appender]") { + DuckDB db(nullptr); // Create an in-memory DuckDB database + Connection con(db); // Create a connection to the database + + SECTION("Insert into table with generated column first") { + // Try to create a table with a generated column 
+ REQUIRE_NOTHROW(con.Query(R"( + CREATE TABLE tbl ( + b VARCHAR GENERATED ALWAYS AS (a), + a VARCHAR + ) + )")); + + Appender appender(con, "tbl"); + REQUIRE_NOTHROW(appender.BeginRow()); + REQUIRE_NOTHROW(appender.Append("a")); + + // Column 'b' is generated from 'a', so it does not need to be explicitly appended + // End the row + REQUIRE_NOTHROW(appender.EndRow()); + + // Close the appender + REQUIRE_NOTHROW(appender.Close()); + + // Query the table to verify that the row was inserted correctly + auto result = con.Query("SELECT * FROM tbl"); + REQUIRE_NO_FAIL(*result); + + // Check that the column 'a' contains "a" and 'b' contains the generated value "a" + REQUIRE(CHECK_COLUMN(result, 0, {Value("a")})); + REQUIRE(CHECK_COLUMN(result, 1, {Value("a")})); + } + + SECTION("Insert into table with generated column second") { + // Try to create a table with a generated column + REQUIRE_NOTHROW(con.Query(R"( + CREATE TABLE tbl ( + a VARCHAR, + b VARCHAR GENERATED ALWAYS AS (a) + ) + )")); + + Appender appender(con, "tbl"); + REQUIRE_NOTHROW(appender.BeginRow()); + REQUIRE_NOTHROW(appender.Append("a")); + + // Column 'b' is generated from 'a', so it does not need to be explicitly appended + // End the row + REQUIRE_NOTHROW(appender.EndRow()); + + // Close the appender + REQUIRE_NOTHROW(appender.Close()); + + // Query the table to verify that the row was inserted correctly + auto result = con.Query("SELECT * FROM tbl"); + REQUIRE_NO_FAIL(*result); + + // Check that the column 'a' contains "a" and 'b' contains the generated value "a" + REQUIRE(CHECK_COLUMN(result, 0, {Value("a")})); + REQUIRE(CHECK_COLUMN(result, 1, {Value("a")})); + } +} + TEST_CASE("Test default value appender", "[appender]") { duckdb::unique_ptr result; DuckDB db(nullptr); From 4d78c47390720a37692f1b623703c8a930c58f0b Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Sat, 12 Oct 2024 21:50:45 +0200 Subject: [PATCH 12/20] Never generate maps if map_inference_threshold is -1 --- 
extension/json/json_functions/json_structure.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extension/json/json_functions/json_structure.cpp b/extension/json/json_functions/json_structure.cpp index d83ccf33bfb..7982003fc13 100644 --- a/extension/json/json_functions/json_structure.cpp +++ b/extension/json/json_functions/json_structure.cpp @@ -716,7 +716,8 @@ static LogicalType StructureToTypeObject(ClientContext &context, const JSONStruc } // If it's an inconsistent object we also just do MAP with the best-possible, recursively-merged value type - if (IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) { + if (map_inference_threshold != DConstants::INVALID_INDEX && + IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) { return LogicalType::MAP(LogicalType::VARCHAR, GetMergedType(context, node, max_depth, field_appearance_threshold, map_inference_threshold, depth + 1, null_type)); From a736a346d9d5e4d16959eda64cece5d13787d1a3 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Sun, 13 Oct 2024 14:29:13 +0200 Subject: [PATCH 13/20] Bump spatial to 3f94d52aa9f7d67b1a30e6cea642bbb790c04aa2 --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 7a7a82fb065..ef5605066c4 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -101,7 +101,7 @@ endif() duckdb_extension_load(spatial DONT_LINK LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_spatial.git - GIT_TAG bb9c829693965f029eb5a312aefed4c538fad781 + GIT_TAG 3f94d52aa9f7d67b1a30e6cea642bbb790c04aa2 INCLUDE_DIR spatial/include TEST_DIR test/sql ) From bba96ac0d56d2ad33ba57714930b1ad2c0ecc94a Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Sun, 13 Oct 2024 14:36:18 +0200 Subject: [PATCH 14/20] Bump azure and delta 
extensions commits --- .github/config/out_of_tree_extensions.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 7a7a82fb065..131484a3885 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -38,7 +38,7 @@ if (NOT MINGW) duckdb_extension_load(azure LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_azure - GIT_TAG b0ffe7ada20cdbd0bee2bbe5461ecd22fb468062 + GIT_TAG a40ecb7bc9036eb8ecc5bf30db935a31b78011f5 ) endif() @@ -49,7 +49,7 @@ if (NOT MINGW AND NOT "${OS_NAME}" STREQUAL "linux") duckdb_extension_load(delta LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_delta - GIT_TAG 3933ebd800ad06a64656c9aef6ca7d62897fa4db + GIT_TAG 811db25f5bd405dea186d6c461a642a387502ad8 ) endif() From 50535f75788faba2355776d73d353230dbdb3b4c Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Sun, 13 Oct 2024 14:37:29 +0200 Subject: [PATCH 15/20] Bump iceberg to latest main --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 7a7a82fb065..18469621b60 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -73,7 +73,7 @@ if (NOT MINGW) duckdb_extension_load(iceberg ${LOAD_ICEBERG_TESTS} GIT_URL https://github.com/duckdb/duckdb_iceberg - GIT_TAG 3f6d753787252e3da1d12157910b62edf729fc6e + GIT_TAG 8b48d1261564613274ac8e9fae01e572d965c99d ) endif() From 75ac4a6307cc1025cb49090493b3a00053ea18f2 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Sun, 13 Oct 2024 14:37:44 +0200 Subject: [PATCH 16/20] Bump sqlite_scanner to latest main --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake
b/.github/config/out_of_tree_extensions.cmake index 18469621b60..2554a61bb6d 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -117,7 +117,7 @@ endif() duckdb_extension_load(sqlite_scanner ${STATIC_LINK_SQLITE} LOAD_TESTS GIT_URL https://github.com/duckdb/sqlite_scanner - GIT_TAG 315861963c8106397af36cbda10faebc8dae485a + GIT_TAG d5d62657702d33cb44a46cddc7ffc4b67bf7e961 ) duckdb_extension_load(sqlsmith From e88ac0a07dfa614c5853828f828d0e6c98237b10 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Sun, 13 Oct 2024 14:37:52 +0200 Subject: [PATCH 17/20] Bump vss to latest main --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 2554a61bb6d..befca34b15d 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -141,7 +141,7 @@ duckdb_extension_load(vss LOAD_TESTS DONT_LINK GIT_URL https://github.com/duckdb/duckdb_vss - GIT_TAG 77739ea5382cce3220af83803ac0b1e98b3ab7d8 + GIT_TAG dd880d6121c0f3dff27131e54e057c9db0f1c710 TEST_DIR test/sql ) From c1b37dea152e7197155ca7d2dd4475ae68f758cc Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Mon, 14 Oct 2024 09:10:21 +0200 Subject: [PATCH 18/20] Add profiling for extension operators --- src/main/query_profiler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/query_profiler.cpp b/src/main/query_profiler.cpp index 01c777c0a57..4f91d46d536 100644 --- a/src/main/query_profiler.cpp +++ b/src/main/query_profiler.cpp @@ -121,6 +121,7 @@ bool QueryProfiler::OperatorRequiresProfiling(PhysicalOperatorType op_type) { case PhysicalOperatorType::UNION: case PhysicalOperatorType::RECURSIVE_CTE: case PhysicalOperatorType::EMPTY_RESULT: + case PhysicalOperatorType::EXTENSION: return true; default: return false; From 28333fa97229ea134e0093c25e577172597915f3 Mon 
Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Tue, 15 Oct 2024 10:33:48 +0200 Subject: [PATCH 19/20] increase bounds --- test/sql/copy/file_size_bytes.test | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/sql/copy/file_size_bytes.test b/test/sql/copy/file_size_bytes.test index bfd27106dce..a16e5389786 100644 --- a/test/sql/copy/file_size_bytes.test +++ b/test/sql/copy/file_size_bytes.test @@ -135,7 +135,6 @@ SELECT count(*) > 1 FROM glob('__TEST_DIR__/file_size_bytes_csv5/*.csv') true # each thread sees ~240kb if it's balanced, what about a 190kb limit -# even in the case of extreme thread imbalance, this always yield around 8 files statement ok COPY (FROM bigdata) TO '__TEST_DIR__/file_size_bytes_csv6' (FORMAT CSV, FILE_SIZE_BYTES '190kb', PER_THREAD_OUTPUT TRUE); @@ -144,9 +143,9 @@ SELECT COUNT(*) FROM read_csv_auto('__TEST_DIR__/file_size_bytes_csv6/*.csv') ---- 100000 -# ~2 files per thread, around 8 in total +# ~2 files per thread, around 8 in total (5 output files in case of extreme thread imbalance and only 1 thread runs) query I -SELECT count(*) BETWEEN 6 AND 10 FROM glob('__TEST_DIR__/file_size_bytes_csv6/*.csv') +SELECT count(*) BETWEEN 5 AND 10 FROM glob('__TEST_DIR__/file_size_bytes_csv6/*.csv') ---- 1 From 1aac7adf60de3acf6480b529ba8abb8ed05c517d Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Tue, 15 Oct 2024 11:09:52 +0200 Subject: [PATCH 20/20] Move _rtools platform to be equivalent to _mingw --- .github/config/distribution_matrix.json | 2 +- .github/workflows/R.yml | 4 ++-- src/include/duckdb/common/platform.hpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/config/distribution_matrix.json b/.github/config/distribution_matrix.json index 5de0ce0e5b1..77c2d4dc644 100644 --- a/.github/config/distribution_matrix.json +++ b/.github/config/distribution_matrix.json @@ -39,7 +39,7 @@ "vcpkg_triplet": "x64-windows-static-md" }, { - "duckdb_arch": "windows_amd64_rtools", + "duckdb_arch": 
"windows_amd64_mingw", "vcpkg_triplet": "x64-mingw-static" } ] diff --git a/.github/workflows/R.yml b/.github/workflows/R.yml index 8dbfc032b0d..17fad947801 100644 --- a/.github/workflows/R.yml +++ b/.github/workflows/R.yml @@ -62,8 +62,8 @@ jobs: - uses: ./.github/actions/build_extensions with: - deploy_as: windows_amd64_rtools - duckdb_arch: windows_amd64_rtools + deploy_as: windows_amd64_mingw + duckdb_arch: windows_amd64_mingw vcpkg_target_triplet: x64-mingw-static treat_warn_as_error: 0 s3_id: ${{ secrets.S3_ID }} diff --git a/src/include/duckdb/common/platform.hpp b/src/include/duckdb/common/platform.hpp index 3166ff9684e..94eea304dec 100644 --- a/src/include/duckdb/common/platform.hpp +++ b/src/include/duckdb/common/platform.hpp @@ -48,9 +48,9 @@ std::string DuckDBPlatform() { // NOLINT: allow definition in header #ifdef __MINGW32__ postfix = "_mingw"; #endif -// this is used for the windows R builds which use a separate build environment +// this is used for the windows R builds which use `mingw` equivalent extensions #ifdef DUCKDB_PLATFORM_RTOOLS - postfix = "_rtools"; + postfix = "_mingw"; #endif return os + "_" + arch + postfix; }