diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 5caee04cfcb..6ca0fcf6b33 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -19,7 +19,7 @@ body: * [duckdb-wasm](https://github.com/duckdb/duckdb-wasm/issues/new) * [go-duckdb](https://github.com/marcboeker/go-duckdb/issues/new) * Extensions: - * [Arrow extension](https://github.com/duckdb/duckdb_arrow/issues/new) + * [Arrow extension](https://github.com/duckdb/arrow/issues/new) * [AWS extension](https://github.com/duckdb/duckdb_aws/issues/new) * [Azure extension](https://github.com/duckdb/duckdb_azure/issues/new) * [Delta extension](https://github.com/duckdb/duckdb_delta/issues/new) diff --git a/benchmark/micro/list/list_extract.benchmark b/benchmark/micro/list/list_extract.benchmark new file mode 100644 index 00000000000..609233ff602 --- /dev/null +++ b/benchmark/micro/list/list_extract.benchmark @@ -0,0 +1,16 @@ +# name: benchmark/micro/list/list_extract.benchmark +# description: Benchmark for the list_extract function +# group: [list] + +name list_extract micro +group micro +subgroup list + +load +CREATE TABLE t1 as SELECT range(0,1000) as l FROM range(0,10000) as r(e); + +run +SELECT sum(list_extract(l, 500)) FROM t1; + +result I +4990000 \ No newline at end of file diff --git a/benchmark/micro/list/list_extract_null.benchmark b/benchmark/micro/list/list_extract_null.benchmark new file mode 100644 index 00000000000..c58ceba22da --- /dev/null +++ b/benchmark/micro/list/list_extract_null.benchmark @@ -0,0 +1,16 @@ +# name: benchmark/micro/list/list_extract_null.benchmark +# description: Benchmark for the list_extract function +# group: [list] + +name list_extract micro +group micro +subgroup list + +load +CREATE TABLE t1 as SELECT list_transform(range(0,1000), a -> if(e % a = 0, null, a)) as l FROM range(0,10000) as r(e); + +run +SELECT count(list_extract(l, 5)) FROM t1; + +result I +7500 \ No newline at end of file diff --git a/benchmark/micro/list/list_extract_struct.benchmark b/benchmark/micro/list/list_extract_struct.benchmark new file mode 100644 index 00000000000..d95505fe43f --- /dev/null +++ b/benchmark/micro/list/list_extract_struct.benchmark @@ -0,0 +1,17 @@ +# name: benchmark/micro/list/list_extract_struct.benchmark +# description: Benchmark for the list_extract function +# group: [list] + +name list_extract micro +group micro +subgroup list + +load +CREATE TABLE t1 as SELECT list_transform(range(0,1000), x -> {'foo': x, 'bar': (-x)::VARCHAR}) as l +FROM range(0,10000) as r(e); + +run +SELECT sum(list_extract(l, 500).foo) FROM t1; + +result I +4990000 \ No newline at end of file diff --git a/benchmark/micro/list/list_extract_struct_null.benchmark b/benchmark/micro/list/list_extract_struct_null.benchmark new file mode 100644 index 00000000000..e33f14aab79 --- /dev/null +++ b/benchmark/micro/list/list_extract_struct_null.benchmark @@ -0,0 +1,17 @@ +# name: benchmark/micro/list/list_extract_struct_null.benchmark +# description: Benchmark for the list_extract function +# group: [list] + +name list_extract micro +group micro +subgroup list + +load +CREATE TABLE t1 as SELECT list_transform(range(0,1000), x -> if(e % x = 0, null, {'foo': x, 'bar': (-x)::VARCHAR})) as l +FROM range(0,10000) as r(e); + +run +SELECT sum(list_extract(l, 500).foo) FROM t1; + +result I +4979521 \ No newline at end of file diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index ba1476456a7..651a4f0abb1 100644 --- 
a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -1120,8 +1120,10 @@ static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids, } names += name.first; } - throw BinderException("Column name \"%s\" specified in FIELD_IDS not found. Available column names: [%s]", - col_name, names); + throw BinderException( + "Column name \"%s\" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this " + "column is a partition column. Available column names: [%s]", + col_name, names); } D_ASSERT(field_ids.ids->find(col_name) == field_ids.ids->end()); // Caught by STRUCT - deduplicates keys diff --git a/src/common/enum_util.cpp b/src/common/enum_util.cpp index 3e34b0feeda..5c12d0538df 100644 --- a/src/common/enum_util.cpp +++ b/src/common/enum_util.cpp @@ -4307,6 +4307,8 @@ const char* EnumUtil::ToChars(MetricsType value) { return "CPU_TIME"; case MetricsType::EXTRA_INFO: return "EXTRA_INFO"; + case MetricsType::CUMULATIVE_CARDINALITY: + return "CUMULATIVE_CARDINALITY"; case MetricsType::OPERATOR_CARDINALITY: return "OPERATOR_CARDINALITY"; case MetricsType::OPERATOR_TIMING: @@ -4324,6 +4326,9 @@ MetricsType EnumUtil::FromString(const char *value) { if (StringUtil::Equals(value, "EXTRA_INFO")) { return MetricsType::EXTRA_INFO; } + if (StringUtil::Equals(value, "CUMULATIVE_CARDINALITY")) { + return MetricsType::CUMULATIVE_CARDINALITY; + } if (StringUtil::Equals(value, "OPERATOR_CARDINALITY")) { return MetricsType::OPERATOR_CARDINALITY; } diff --git a/src/common/render_tree.cpp b/src/common/render_tree.cpp index b274b37ba3e..fc1f285b96b 100644 --- a/src/common/render_tree.cpp +++ b/src/common/render_tree.cpp @@ -118,14 +118,19 @@ static unique_ptr CreateNode(const PipelineRenderNode &op) { static unique_ptr CreateNode(const ProfilingNode &op) { InsertionOrderPreservingMap extra_info; if (op.GetProfilingInfo().Enabled(MetricsType::EXTRA_INFO)) { - extra_info = op.GetProfilingInfo().metrics.extra_info; + extra_info = op.GetProfilingInfo().extra_info; } auto node_name = op.GetName(); auto result = make_uniq(node_name, extra_info); - result->extra_text["Cardinality"] = to_string(op.GetProfilingInfo().metrics.operator_cardinality); - string timing = StringUtil::Format("%.2f", op.GetProfilingInfo().metrics.operator_timing); - result->extra_text["Timing"] = timing + "s"; + if (op.GetProfilingInfo().Enabled(MetricsType::OPERATOR_CARDINALITY)) { + result->extra_text["Cardinality"] = op.GetProfilingInfo().GetMetricAsString(MetricsType::OPERATOR_CARDINALITY); + } + if (op.GetProfilingInfo().Enabled(MetricsType::OPERATOR_TIMING)) { + string timing = StringUtil::Format( + "%.2f", op.GetProfilingInfo().metrics.at(MetricsType::OPERATOR_TIMING).GetValue()); + result->extra_text["Timing"] = timing + "s"; + } return result; } diff --git a/src/execution/operator/persistent/physical_copy_to_file.cpp b/src/execution/operator/persistent/physical_copy_to_file.cpp index 7123ea0d2df..fece3ef0188 100644 --- a/src/execution/operator/persistent/physical_copy_to_file.cpp +++ b/src/execution/operator/persistent/physical_copy_to_file.cpp @@ -7,6 +7,7 @@ #include "duckdb/common/types/uuid.hpp" #include "duckdb/common/value_operations/value_operations.hpp" #include "duckdb/common/vector_operations/vector_operations.hpp" +#include "duckdb/planner/operator/logical_copy_to_file.hpp" #include @@ -228,6 +229,22 @@ class CopyToFunctionLocalState : public LocalSinkState { append_count = 0; } + void SetDataWithoutPartitions(DataChunk &chunk, const DataChunk &source, 
const vector &col_types, + const vector &part_cols) { + D_ASSERT(source.ColumnCount() == col_types.size()); + auto types = LogicalCopyToFile::GetTypesWithoutPartitions(col_types, part_cols, false); + chunk.InitializeEmpty(types); + set part_col_set(part_cols.begin(), part_cols.end()); + idx_t new_col_id = 0; + for (idx_t col_idx = 0; col_idx < source.ColumnCount(); col_idx++) { + if (part_col_set.find(col_idx) == part_col_set.end()) { + chunk.data[new_col_id].Reference(source.data[col_idx]); + new_col_id++; + } + } + chunk.SetCardinality(source.size()); + } + void FlushPartitions(ExecutionContext &context, const PhysicalCopyToFile &op, CopyToFunctionGlobalState &g) { if (!part_buffer) { return; @@ -247,7 +264,14 @@ class CopyToFunctionLocalState : public LocalSinkState { auto local_copy_state = op.function.copy_to_initialize_local(context, *op.bind_data); // push the chunks into the write state for (auto &chunk : partitions[i]->Chunks()) { - op.function.copy_to_sink(context, *op.bind_data, *info.global_state, *local_copy_state, chunk); + if (op.write_partition_columns) { + op.function.copy_to_sink(context, *op.bind_data, *info.global_state, *local_copy_state, chunk); + } else { + DataChunk filtered_chunk; + SetDataWithoutPartitions(filtered_chunk, chunk, op.expected_types, op.partition_columns); + op.function.copy_to_sink(context, *op.bind_data, *info.global_state, *local_copy_state, + filtered_chunk); + } } op.function.copy_to_combine(context, *op.bind_data, *info.global_state, *local_copy_state); local_copy_state.reset(); diff --git a/src/execution/physical_plan/plan_copy_to_file.cpp b/src/execution/physical_plan/plan_copy_to_file.cpp index 8c26a0ea0eb..a2981f61f17 100644 --- a/src/execution/physical_plan/plan_copy_to_file.cpp +++ b/src/execution/physical_plan/plan_copy_to_file.cpp @@ -39,6 +39,7 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile copy->return_type = op.return_type; return std::move(copy); } + // COPY from select statement to file auto copy = make_uniq(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality); copy->file_path = op.file_path; @@ -54,6 +55,7 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile copy->return_type = op.return_type; copy->partition_output = op.partition_output; copy->partition_columns = op.partition_columns; + copy->write_partition_columns = op.write_partition_columns; copy->names = op.names; copy->expected_types = op.expected_types; copy->parallel = mode == CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE; diff --git a/src/execution/window_executor.cpp b/src/execution/window_executor.cpp index 3b56b04a064..35bc7d97add 100644 --- a/src/execution/window_executor.cpp +++ b/src/execution/window_executor.cpp @@ -1682,9 +1682,10 @@ void WindowLeadLagExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate, i += width; row_idx += width; } else if (wexpr.default_expr) { - llstate.leadlag_default.CopyCell(result, i, delta); - i += delta; - row_idx += delta; + const auto width = MinValue(delta, count - i); + llstate.leadlag_default.CopyCell(result, i, width); + i += width; + row_idx += width; } else { for (idx_t nulls = MinValue(delta, count - i); nulls--; ++i, ++row_idx) { FlatVector::SetNull(result, i, true); diff --git a/src/function/scalar/list/list_extract.cpp b/src/function/scalar/list/list_extract.cpp index aa8278ce63c..bc2f081c0cd 100644 --- a/src/function/scalar/list/list_extract.cpp +++ b/src/function/scalar/list/list_extract.cpp @@ -1,160 +1,40 @@ #include "duckdb/common/pair.hpp" #include 
"duckdb/common/string_util.hpp" - #include "duckdb/common/types/data_chunk.hpp" #include "duckdb/common/uhugeint.hpp" #include "duckdb/common/vector_operations/binary_executor.hpp" #include "duckdb/function/scalar/nested_functions.hpp" #include "duckdb/function/scalar/string_functions.hpp" #include "duckdb/parser/expression/bound_expression.hpp" -#include "duckdb/planner/expression/bound_function_expression.hpp" #include "duckdb/planner/expression/bound_cast_expression.hpp" +#include "duckdb/planner/expression/bound_function_expression.hpp" #include "duckdb/storage/statistics/list_stats.hpp" namespace duckdb { -template -void ListExtractTemplate(idx_t count, UnifiedVectorFormat &list_data, UnifiedVectorFormat &offsets_data, - Vector &child_vector, idx_t list_size, Vector &result) { - UnifiedVectorFormat child_format; - child_vector.ToUnifiedFormat(list_size, child_format); - - T *result_data; - - result.SetVectorType(VectorType::FLAT_VECTOR); - if (!VALIDITY_ONLY) { - result_data = FlatVector::GetData(result); +static optional_idx TryGetChildOffset(const list_entry_t &list_entry, const int64_t offset) { + // 1-based indexing + if (offset == 0) { + return optional_idx::Invalid(); } - auto &result_mask = FlatVector::Validity(result); - // heap-ref once - if (HEAP_REF) { - StringVector::AddHeapReference(result, child_vector); + const auto index_offset = (offset > 0) ? offset - 1 : offset; + if (index_offset < 0) { + const auto signed_list_length = UnsafeNumericCast(list_entry.length); + if (signed_list_length + index_offset < 0) { + return optional_idx::Invalid(); + } + return optional_idx(list_entry.offset + UnsafeNumericCast(signed_list_length + index_offset)); } - // this is lifted from ExecuteGenericLoop because we can't push the list child data into this otherwise - // should have gone with GetValue perhaps - auto child_data = UnifiedVectorFormat::GetData(child_format); - for (idx_t i = 0; i < count; i++) { - auto list_index = list_data.sel->get_index(i); - auto offsets_index = offsets_data.sel->get_index(i); - if (!list_data.validity.RowIsValid(list_index)) { - result_mask.SetInvalid(i); - continue; - } - if (!offsets_data.validity.RowIsValid(offsets_index)) { - result_mask.SetInvalid(i); - continue; - } - auto list_entry = (UnifiedVectorFormat::GetData(list_data))[list_index]; - auto offsets_entry = (UnifiedVectorFormat::GetData(offsets_data))[offsets_index]; + const auto unsigned_offset = UnsafeNumericCast(index_offset); - // 1-based indexing - if (offsets_entry == 0) { - result_mask.SetInvalid(i); - continue; - } - offsets_entry = (offsets_entry > 0) ? 
offsets_entry - 1 : offsets_entry; - - idx_t child_offset; - if (offsets_entry < 0) { - if (offsets_entry < -int64_t(list_entry.length)) { - result_mask.SetInvalid(i); - continue; - } - child_offset = UnsafeNumericCast(UnsafeNumericCast(list_entry.offset + list_entry.length) + - offsets_entry); - } else { - if ((idx_t)offsets_entry >= list_entry.length) { - result_mask.SetInvalid(i); - continue; - } - child_offset = UnsafeNumericCast(UnsafeNumericCast(list_entry.offset) + offsets_entry); - } - auto child_index = child_format.sel->get_index(child_offset); - if (child_format.validity.RowIsValid(child_index)) { - if (!VALIDITY_ONLY) { - result_data[i] = child_data[child_index]; - } - } else { - result_mask.SetInvalid(i); - } - } - if (count == 1) { - result.SetVectorType(VectorType::CONSTANT_VECTOR); + // Check that the offset is within the list + if (unsigned_offset >= list_entry.length) { + return optional_idx::Invalid(); } -} -static void ExecuteListExtractInternal(const idx_t count, UnifiedVectorFormat &list, UnifiedVectorFormat &offsets, - Vector &child_vector, idx_t list_size, Vector &result) { - D_ASSERT(child_vector.GetType() == result.GetType()); - switch (result.GetType().InternalType()) { - case PhysicalType::BOOL: - case PhysicalType::INT8: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::INT16: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::INT32: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::INT64: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::INT128: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::UINT8: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::UINT16: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::UINT32: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::UINT64: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::UINT128: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::FLOAT: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::DOUBLE: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::VARCHAR: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::INTERVAL: - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - case PhysicalType::STRUCT: { - auto &entries = StructVector::GetEntries(child_vector); - auto &result_entries = StructVector::GetEntries(result); - D_ASSERT(entries.size() == result_entries.size()); - // extract the child entries of the struct - for (idx_t i = 0; i < entries.size(); i++) { - ExecuteListExtractInternal(count, list, offsets, *entries[i], list_size, *result_entries[i]); - } - // extract the validity mask - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - } - case PhysicalType::LIST: { - // nested list: we have to reference the child - auto &child_child_list = ListVector::GetEntry(child_vector); - 
ListVector::GetEntry(result).Reference(child_child_list); - ListVector::SetListSize(result, ListVector::GetListSize(child_vector)); - ListExtractTemplate(count, list, offsets, child_vector, list_size, result); - break; - } - default: - throw NotImplementedException("Unimplemented type for LIST_EXTRACT"); - } + return optional_idx(list_entry.offset + unsigned_offset); } static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, const idx_t count) { @@ -164,8 +44,52 @@ static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, co list.ToUnifiedFormat(count, list_data); offsets.ToUnifiedFormat(count, offsets_data); - ExecuteListExtractInternal(count, list_data, offsets_data, ListVector::GetEntry(list), - ListVector::GetListSize(list), result); + + const auto list_ptr = UnifiedVectorFormat::GetData<list_entry_t>(list_data); + const auto offsets_ptr = UnifiedVectorFormat::GetData<int64_t>(offsets_data); + + UnifiedVectorFormat child_data; + auto &child_vector = ListVector::GetEntry(list); + auto child_count = ListVector::GetListSize(list); + child_vector.ToUnifiedFormat(child_count, child_data); + + SelectionVector sel(count); + vector<idx_t> invalid_offsets; + + for (idx_t i = 0; i < count; i++) { + const auto list_index = list_data.sel->get_index(i); + const auto offsets_index = offsets_data.sel->get_index(i); + + if (!list_data.validity.RowIsValid(list_index) || !offsets_data.validity.RowIsValid(offsets_index)) { + invalid_offsets.push_back(i); + sel.set_index(i, 0); + continue; + } + + const auto child_offset = TryGetChildOffset(list_ptr[list_index], offsets_ptr[offsets_index]); + + if (!child_offset.IsValid()) { + invalid_offsets.push_back(i); + sel.set_index(i, 0); + continue; + } + + const auto child_idx = child_data.sel->get_index(child_offset.GetIndex()); + sel.set_index(i, child_idx); + } + + VectorOperations::Copy(child_vector, result, sel, count, 0, 0); + + // Copying the vectors also copies the validity mask, so here we explicitly set the rows with invalid offsets (which were mapped to index 0 above) to NULL.
+ for (const auto &invalid_idx : invalid_offsets) { + FlatVector::SetNull(result, invalid_idx, true); + } + + if (count == 1 || (list.GetVectorType() == VectorType::CONSTANT_VECTOR && + offsets.GetVectorType() == VectorType::CONSTANT_VECTOR)) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); + } + result.Verify(count); } @@ -180,13 +104,6 @@ static void ListExtractFunction(DataChunk &args, ExpressionState &state, Vector D_ASSERT(args.ColumnCount() == 2); auto count = args.size(); - result.SetVectorType(VectorType::CONSTANT_VECTOR); - for (idx_t i = 0; i < args.ColumnCount(); i++) { - if (args.data[i].GetVectorType() != VectorType::CONSTANT_VECTOR) { - result.SetVectorType(VectorType::FLAT_VECTOR); - } - } - Vector &base = args.data[0]; Vector &subscript = args.data[1]; diff --git a/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp b/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp index 8a818dadc0c..6201e671bb1 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp @@ -41,6 +41,7 @@ class PhysicalCopyToFile : public PhysicalOperator { CopyFunctionReturnType return_type; bool partition_output; + bool write_partition_columns; vector partition_columns; vector names; vector expected_types; diff --git a/src/include/duckdb/main/profiling_info.hpp b/src/include/duckdb/main/profiling_info.hpp index 2c4f27fbcd9..76d7bb9ea51 100644 --- a/src/include/duckdb/main/profiling_info.hpp +++ b/src/include/duckdb/main/profiling_info.hpp @@ -23,7 +23,13 @@ struct yyjson_mut_val; namespace duckdb { -enum class MetricsType : uint8_t { CPU_TIME, EXTRA_INFO, OPERATOR_CARDINALITY, OPERATOR_TIMING }; +enum class MetricsType : uint8_t { + CPU_TIME, + EXTRA_INFO, + CUMULATIVE_CARDINALITY, + OPERATOR_CARDINALITY, + OPERATOR_TIMING +}; struct MetricsTypeHashFunction { uint64_t operator()(const MetricsType &index) const { @@ -32,38 +38,19 @@ struct MetricsTypeHashFunction { }; typedef unordered_set profiler_settings_t; - -struct SettingSetFunctions { - static bool Enabled(const profiler_settings_t &settings, const MetricsType setting) { - if (settings.find(setting) != settings.end()) { - return true; - } - if (setting == MetricsType::OPERATOR_TIMING && Enabled(settings, MetricsType::CPU_TIME)) { - return true; - } - return false; - } -}; - -struct Metrics { - double cpu_time; - InsertionOrderPreservingMap extra_info; - idx_t operator_cardinality; - double operator_timing; - - Metrics() : cpu_time(0), operator_cardinality(0), operator_timing(0) { - } -}; +typedef unordered_map profiler_metrics_t; class ProfilingInfo { public: // set of metrics with their values; only enabled metrics are present in the set profiler_settings_t settings; - Metrics metrics; + profiler_metrics_t metrics; + InsertionOrderPreservingMap extra_info; public: ProfilingInfo() = default; explicit ProfilingInfo(profiler_settings_t &n_settings) : settings(n_settings) { + ResetMetrics(); } ProfilingInfo(ProfilingInfo &) = default; ProfilingInfo &operator=(ProfilingInfo const &) = default; @@ -84,5 +71,28 @@ class ProfilingInfo { public: string GetMetricAsString(MetricsType setting) const; void WriteMetricsToJSON(duckdb_yyjson::yyjson_mut_doc *doc, duckdb_yyjson::yyjson_mut_val *destination); + +public: + template + METRIC_TYPE &GetMetricValue(const MetricsType setting) { + return metrics[setting].GetValue(); + } + + template + void AddToMetric(const MetricsType setting, const Value 
&value) { + D_ASSERT(!metrics[setting].IsNull()); + if (metrics.find(setting) == metrics.end()) { + metrics[setting] = value; + return; + } + auto new_value = metrics[setting].GetValue() + value.GetValue(); + metrics[setting] = Value::CreateValue(new_value); + } + + template + void AddToMetric(const MetricsType setting, const METRIC_TYPE &value) { + auto new_value = Value::CreateValue(value); + return AddToMetric(setting, new_value); + } }; } // namespace duckdb diff --git a/src/include/duckdb/main/query_profiler.hpp b/src/include/duckdb/main/query_profiler.hpp index 3a433db3599..2bfe7bca0ad 100644 --- a/src/include/duckdb/main/query_profiler.hpp +++ b/src/include/duckdb/main/query_profiler.hpp @@ -66,9 +66,7 @@ class OperatorProfiler { int id); DUCKDB_API OperatorInformation &GetOperatorInfo(const PhysicalOperator &phys_op); - static bool SettingEnabled(const MetricsType setting) { - return SettingSetFunctions::Enabled(ProfilingInfo::DefaultSettings(), setting); - } + bool SettingIsEnabled(MetricsType metric) const; ~OperatorProfiler() { } diff --git a/src/include/duckdb/planner/operator/logical_copy_to_file.hpp b/src/include/duckdb/planner/operator/logical_copy_to_file.hpp index 3c4a791d387..5ec454b0e01 100644 --- a/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +++ b/src/include/duckdb/planner/operator/logical_copy_to_file.hpp @@ -41,6 +41,7 @@ class LogicalCopyToFile : public LogicalOperator { CopyFunctionReturnType return_type; bool partition_output; + bool write_partition_columns; vector partition_columns; vector names; vector expected_types; @@ -50,6 +51,10 @@ class LogicalCopyToFile : public LogicalOperator { idx_t EstimateCardinality(ClientContext &context) override; void Serialize(Serializer &serializer) const override; static unique_ptr Deserialize(Deserializer &deserializer); + static vector GetTypesWithoutPartitions(const vector &col_types, + const vector &part_cols, bool write_part_cols); + static vector GetNamesWithoutPartitions(const vector &col_names, const vector &part_cols, + bool write_part_cols); protected: void ResolveTypes() override { diff --git a/src/main/profiling_info.cpp b/src/main/profiling_info.cpp index 53cbdcd73db..9f048162d08 100644 --- a/src/main/profiling_info.cpp +++ b/src/main/profiling_info.cpp @@ -21,6 +21,7 @@ profiler_settings_t ProfilingInfo::DefaultSettings() { return { MetricsType::CPU_TIME, MetricsType::EXTRA_INFO, + MetricsType::CUMULATIVE_CARDINALITY, MetricsType::OPERATOR_CARDINALITY, MetricsType::OPERATOR_TIMING, }; @@ -32,7 +33,30 @@ void ProfilingInfo::ResetSettings() { } void ProfilingInfo::ResetMetrics() { - metrics = Metrics(); + metrics.clear(); + + auto default_settings = DefaultSettings(); + + for (auto &metric : default_settings) { + if (!Enabled(metric)) { + continue; + } + + switch (metric) { + case MetricsType::CPU_TIME: + case MetricsType::OPERATOR_TIMING: { + metrics[metric] = Value::CreateValue(0.0); + break; + } + case MetricsType::CUMULATIVE_CARDINALITY: + case MetricsType::OPERATOR_CARDINALITY: { + metrics[metric] = Value::CreateValue(0); + break; + } + case MetricsType::EXTRA_INFO: + break; + } + } } bool ProfilingInfo::Enabled(const MetricsType setting) const { @@ -42,16 +66,20 @@ bool ProfilingInfo::Enabled(const MetricsType setting) const { if (setting == MetricsType::OPERATOR_TIMING && Enabled(MetricsType::CPU_TIME)) { return true; } + if (setting == MetricsType::OPERATOR_CARDINALITY && Enabled(MetricsType::CUMULATIVE_CARDINALITY)) { + return true; + } return false; } string 
ProfilingInfo::GetMetricAsString(MetricsType setting) const { - switch (setting) { - case MetricsType::CPU_TIME: - return to_string(metrics.cpu_time); - case MetricsType::EXTRA_INFO: { + if (!Enabled(setting)) { + throw InternalException("Metric %s not enabled", EnumUtil::ToString(setting)); + } + + if (setting == MetricsType::EXTRA_INFO) { string result; - for (auto &it : QueryProfiler::JSONSanitize(metrics.extra_info)) { + for (auto &it : extra_info) { if (!result.empty()) { result += ", "; } @@ -59,24 +87,23 @@ string ProfilingInfo::GetMetricAsString(MetricsType setting) const { } return "\"" + result + "\""; } - case MetricsType::OPERATOR_CARDINALITY: - return to_string(metrics.operator_cardinality); - case MetricsType::OPERATOR_TIMING: - return to_string(metrics.operator_timing); - default: - throw NotImplementedException("MetricsType %s not implemented", EnumUtil::ToString(setting)); - } + + // The metric cannot be NULL, and should have been 0 initialized. + D_ASSERT(!metrics.at(setting).IsNull()); + + return metrics.at(setting).ToString(); } void ProfilingInfo::WriteMetricsToJSON(yyjson_mut_doc *doc, yyjson_mut_val *dest) { for (auto &metric : settings) { - switch (metric) { - case MetricsType::CPU_TIME: - yyjson_mut_obj_add_real(doc, dest, "cpu_time", metrics.cpu_time); - break; - case MetricsType::EXTRA_INFO: { - auto extra_info = yyjson_mut_obj(doc); - for (auto &it : metrics.extra_info) { + auto metric_str = StringUtil::Lower(EnumUtil::ToString(metric)); + auto key_val = yyjson_mut_strcpy(doc, metric_str.c_str()); + auto key_ptr = yyjson_mut_get_str(key_val); + + if (metric == MetricsType::EXTRA_INFO) { + auto extra_info_obj = yyjson_mut_obj(doc); + + for (auto &it : extra_info) { auto &key = it.first; auto &value = it.second; auto splits = StringUtil::Split(value, "\n"); @@ -85,20 +112,27 @@ void ProfilingInfo::WriteMetricsToJSON(yyjson_mut_doc *doc, yyjson_mut_val *dest for (auto &split : splits) { yyjson_mut_arr_add_strcpy(doc, list_items, split.c_str()); } - yyjson_mut_obj_add_val(doc, extra_info, key.c_str(), list_items); + yyjson_mut_obj_add_val(doc, extra_info_obj, key.c_str(), list_items); } else { - yyjson_mut_obj_add_strcpy(doc, extra_info, key.c_str(), value.c_str()); + yyjson_mut_obj_add_strcpy(doc, extra_info_obj, key.c_str(), value.c_str()); } } - yyjson_mut_obj_add_val(doc, dest, "extra_info", extra_info); - break; + yyjson_mut_obj_add_val(doc, dest, key_ptr, extra_info_obj); + continue; } - case MetricsType::OPERATOR_CARDINALITY: { - yyjson_mut_obj_add_uint(doc, dest, "operator_cardinality", metrics.operator_cardinality); + + // The metric cannot be NULL, and should have been 0 initialized. 
+ D_ASSERT(!metrics[metric].IsNull()); + + switch (metric) { + case MetricsType::CPU_TIME: + case MetricsType::OPERATOR_TIMING: { + yyjson_mut_obj_add_real(doc, dest, key_ptr, metrics[metric].GetValue()); break; } - case MetricsType::OPERATOR_TIMING: { - yyjson_mut_obj_add_real(doc, dest, "operator_timing", metrics.operator_timing); + case MetricsType::CUMULATIVE_CARDINALITY: + case MetricsType::OPERATOR_CARDINALITY: { + yyjson_mut_obj_add_uint(doc, dest, key_ptr, metrics[metric].GetValue()); break; } default: diff --git a/src/main/query_profiler.cpp b/src/main/query_profiler.cpp index 9b1fe0da893..6c4b0a3684c 100644 --- a/src/main/query_profiler.cpp +++ b/src/main/query_profiler.cpp @@ -136,8 +136,9 @@ void QueryProfiler::Finalize(ProfilingNode &node) { Finalize(*child); if (op_node.type == PhysicalOperatorType::UNION && op_node.GetProfilingInfo().Enabled(MetricsType::OPERATOR_CARDINALITY)) { - op_node.GetProfilingInfo().metrics.operator_cardinality += - child->GetProfilingInfo().metrics.operator_cardinality; + op_node.GetProfilingInfo().AddToMetric( + MetricsType::OPERATOR_CARDINALITY, + child->GetProfilingInfo().metrics[MetricsType::OPERATOR_CARDINALITY].GetValue()); } } } @@ -146,14 +147,14 @@ void QueryProfiler::StartExplainAnalyze() { this->is_explain_analyze = true; } -static void GetTotalCPUTime(ProfilingNode &node) { - node.GetProfilingInfo().metrics.cpu_time = node.GetProfilingInfo().metrics.operator_timing; - if (node.GetChildCount() > 0) { - for (idx_t i = 0; i < node.GetChildCount(); i++) { - auto child = node.GetChild(i); - GetTotalCPUTime(*child); - node.GetProfilingInfo().metrics.cpu_time += child->GetProfilingInfo().metrics.cpu_time; - } +template +static void GetCumulativeMetric(ProfilingNode &node, MetricsType cumulative_metric, MetricsType child_metric) { + node.GetProfilingInfo().metrics[cumulative_metric] = node.GetProfilingInfo().metrics[child_metric]; + for (idx_t i = 0; i < node.GetChildCount(); i++) { + auto child = node.GetChild(i); + GetCumulativeMetric(*child, cumulative_metric, child_metric); + node.GetProfilingInfo().AddToMetric( + cumulative_metric, child->GetProfilingInfo().metrics[cumulative_metric].GetValue()); } } @@ -177,10 +178,14 @@ void QueryProfiler::EndQuery() { query_info.query = query; query_info.GetProfilingInfo() = ProfilingInfo(ClientConfig::GetConfig(context).profiler_settings); if (query_info.GetProfilingInfo().Enabled(MetricsType::OPERATOR_TIMING)) { - query_info.GetProfilingInfo().metrics.operator_timing = main_query.Elapsed(); + query_info.GetProfilingInfo().metrics[MetricsType::OPERATOR_TIMING] = main_query.Elapsed(); } if (query_info.GetProfilingInfo().Enabled(MetricsType::CPU_TIME)) { - GetTotalCPUTime(*root); + GetCumulativeMetric(*root, MetricsType::CPU_TIME, MetricsType::OPERATOR_TIMING); + } + if (query_info.GetProfilingInfo().Enabled(MetricsType::CUMULATIVE_CARDINALITY)) { + GetCumulativeMetric(*root, MetricsType::CUMULATIVE_CARDINALITY, + MetricsType::OPERATOR_CARDINALITY); } } @@ -263,6 +268,19 @@ OperatorProfiler::OperatorProfiler(ClientContext &context) { settings = ClientConfig::GetConfig(context).profiler_settings; } +bool OperatorProfiler::SettingIsEnabled(MetricsType metric) const { + if (settings.find(metric) != settings.end()) { + return true; + } + if (metric == MetricsType::OPERATOR_TIMING && SettingIsEnabled(MetricsType::CPU_TIME)) { + return true; + } + if (metric == MetricsType::OPERATOR_CARDINALITY && SettingIsEnabled(MetricsType::CUMULATIVE_CARDINALITY)) { + return true; + } + return false; +} + void 
OperatorProfiler::StartOperator(optional_ptr phys_op) { if (!enabled) { return; @@ -275,7 +293,7 @@ void OperatorProfiler::StartOperator(optional_ptr phys_o active_operator = phys_op; // start timing for current element - if (SettingEnabled(MetricsType::OPERATOR_TIMING)) { + if (SettingIsEnabled(MetricsType::OPERATOR_TIMING)) { op.Start(); } } @@ -289,8 +307,8 @@ void OperatorProfiler::EndOperator(optional_ptr chunk) { throw InternalException("OperatorProfiler: Attempting to call EndOperator while another operator is active"); } - bool timing_enabled = SettingEnabled(MetricsType::OPERATOR_TIMING); - bool cardinality_enabled = SettingEnabled(MetricsType::OPERATOR_CARDINALITY); + bool timing_enabled = SettingIsEnabled(MetricsType::OPERATOR_TIMING); + bool cardinality_enabled = SettingIsEnabled(MetricsType::OPERATOR_CARDINALITY); if (timing_enabled || cardinality_enabled) { // get the operator info for the current element auto &curr_operator_info = GetOperatorInfo(*active_operator); @@ -339,11 +357,11 @@ void QueryProfiler::Flush(OperatorProfiler &profiler) { D_ASSERT(entry != tree_map.end()); auto &tree_node = entry->second.get(); - if (profiler.SettingEnabled(MetricsType::OPERATOR_TIMING)) { - tree_node.GetProfilingInfo().metrics.operator_timing += node.second.time; + if (tree_node.GetProfilingInfo().Enabled(MetricsType::OPERATOR_TIMING)) { + tree_node.GetProfilingInfo().AddToMetric(MetricsType::OPERATOR_TIMING, node.second.time); } - if (profiler.SettingEnabled(MetricsType::OPERATOR_CARDINALITY)) { - tree_node.GetProfilingInfo().metrics.operator_cardinality += node.second.elements; + if (tree_node.GetProfilingInfo().Enabled(MetricsType::OPERATOR_CARDINALITY)) { + tree_node.GetProfilingInfo().AddToMetric(MetricsType::OPERATOR_CARDINALITY, node.second.elements); } } profiler.timings.clear(); @@ -604,7 +622,7 @@ unique_ptr QueryProfiler::CreateTree(const PhysicalOperator &root node->depth = depth; node->GetProfilingInfo() = ProfilingInfo(settings); if (node->GetProfilingInfo().Enabled(MetricsType::EXTRA_INFO)) { - node->GetProfilingInfo().metrics.extra_info = root.ParamsToString(); + node->GetProfilingInfo().extra_info = root.ParamsToString(); } tree_map.insert(make_pair(reference(root), reference(*node))); auto children = root.GetChildren(); diff --git a/src/planner/binder/statement/bind_copy.cpp b/src/planner/binder/statement/bind_copy.cpp index 901ad4e946c..d95db2e0a61 100644 --- a/src/planner/binder/statement/bind_copy.cpp +++ b/src/planner/binder/statement/bind_copy.cpp @@ -61,6 +61,7 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) vector partition_cols; bool seen_overwrite_mode = false; bool seen_filepattern = false; + bool write_partition_columns = false; CopyFunctionReturnType return_type = CopyFunctionReturnType::CHANGED_ROWS; CopyFunctionBindInput bind_input(*stmt.info); @@ -126,6 +127,8 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) if (GetBooleanArg(context, option.second)) { return_type = CopyFunctionReturnType::CHANGED_ROWS_AND_FILE_LIST; } + } else if (loption == "write_partition_columns") { + write_partition_columns = true; } else { stmt.info->options[option.first] = option.second; } @@ -148,6 +151,12 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) if (file_size_bytes.IsValid() && !partition_cols.empty()) { throw NotImplementedException("Can't combine FILE_SIZE_BYTES and PARTITION_BY for COPY"); } + if (!write_partition_columns) { + if (partition_cols.size() == 
select_node.names.size()) { + throw NotImplementedException("No column to write as all columns are specified as partition columns. " + "WRITE_PARTITION_COLUMNS option can be used to write partition columns."); + } + } bool is_remote_file = FileSystem::IsRemoteFile(stmt.info->file_path); if (is_remote_file) { use_tmp_file = false; @@ -198,8 +207,11 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) QueryResult::DeduplicateColumns(unique_column_names); auto file_path = stmt.info->file_path; - auto function_data = - copy_function.function.copy_to_bind(context, bind_input, unique_column_names, select_node.types); + auto names_to_write = + LogicalCopyToFile::GetNamesWithoutPartitions(unique_column_names, partition_cols, write_partition_columns); + auto types_to_write = + LogicalCopyToFile::GetTypesWithoutPartitions(select_node.types, partition_cols, write_partition_columns); + auto function_data = copy_function.function.copy_to_bind(context, bind_input, names_to_write, types_to_write); const auto rotate = copy_function.function.rotate_files && copy_function.function.rotate_files(*function_data, file_size_bytes); @@ -230,6 +242,7 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) } copy->rotate = rotate; copy->partition_output = !partition_cols.empty(); + copy->write_partition_columns = write_partition_columns; copy->partition_columns = std::move(partition_cols); copy->return_type = return_type; diff --git a/src/planner/operator/logical_copy_to_file.cpp b/src/planner/operator/logical_copy_to_file.cpp index 6b6e2116084..bac840fce2b 100644 --- a/src/planner/operator/logical_copy_to_file.cpp +++ b/src/planner/operator/logical_copy_to_file.cpp @@ -8,6 +8,36 @@ namespace duckdb { +vector LogicalCopyToFile::GetTypesWithoutPartitions(const vector &col_types, + const vector &part_cols, bool write_part_cols) { + if (write_part_cols || part_cols.empty()) { + return col_types; + } + vector types; + set part_col_set(part_cols.begin(), part_cols.end()); + for (idx_t col_idx = 0; col_idx < col_types.size(); col_idx++) { + if (part_col_set.find(col_idx) == part_col_set.end()) { + types.push_back(col_types[col_idx]); + } + } + return types; +} + +vector LogicalCopyToFile::GetNamesWithoutPartitions(const vector &col_names, + const vector &part_cols, bool write_part_cols) { + if (write_part_cols || part_cols.empty()) { + return col_names; + } + vector names; + set part_col_set(part_cols.begin(), part_cols.end()); + for (idx_t col_idx = 0; col_idx < col_names.size(); col_idx++) { + if (part_col_set.find(col_idx) == part_col_set.end()) { + names.push_back(col_names[col_idx]); + } + } + return names; +} + void LogicalCopyToFile::Serialize(Serializer &serializer) const { LogicalOperator::Serialize(serializer); serializer.WriteProperty(200, "file_path", file_path); @@ -35,6 +65,7 @@ void LogicalCopyToFile::Serialize(Serializer &serializer) const { serializer.WriteProperty(213, "file_extension", file_extension); serializer.WriteProperty(214, "rotate", rotate); serializer.WriteProperty(215, "return_type", return_type); + serializer.WriteProperty(216, "write_partition_columns", write_partition_columns); } unique_ptr LogicalCopyToFile::Deserialize(Deserializer &deserializer) { @@ -68,14 +99,6 @@ unique_ptr LogicalCopyToFile::Deserialize(Deserializer &deseria // Just deserialize the bind data deserializer.ReadObject(212, "function_data", [&](Deserializer &obj) { bind_data = function.deserialize(obj, function); }); - } else { - // Otherwise, re-bind with the 
copy info - if (!function.copy_to_bind) { - throw InternalException("Copy function \"%s\" has neither bind nor (de)serialize", function.name); - } - - CopyFunctionBindInput function_bind_input(*copy_info); - bind_data = function.copy_to_bind(context, function_bind_input, names, expected_types); } auto default_extension = function.extension; @@ -85,6 +108,19 @@ unique_ptr LogicalCopyToFile::Deserialize(Deserializer &deseria auto rotate = deserializer.ReadPropertyWithDefault(214, "rotate", false); auto return_type = deserializer.ReadPropertyWithDefault(215, "return_type", CopyFunctionReturnType::CHANGED_ROWS); + auto write_partition_columns = deserializer.ReadProperty(216, "write_partition_columns"); + + if (!has_serialize) { + // If not serialized, re-bind with the copy info + if (!function.copy_to_bind) { + throw InternalException("Copy function \"%s\" has neither bind nor (de)serialize", function.name); + } + + CopyFunctionBindInput function_bind_input(*copy_info); + auto names_to_write = GetNamesWithoutPartitions(names, partition_columns, write_partition_columns); + auto types_to_write = GetTypesWithoutPartitions(expected_types, partition_columns, write_partition_columns); + bind_data = function.copy_to_bind(context, function_bind_input, names_to_write, types_to_write); + } auto result = make_uniq(function, std::move(bind_data), std::move(copy_info)); result->file_path = file_path; @@ -99,6 +135,7 @@ unique_ptr LogicalCopyToFile::Deserialize(Deserializer &deseria result->expected_types = expected_types; result->rotate = rotate; result->return_type = return_type; + result->write_partition_columns = write_partition_columns; return std::move(result); } diff --git a/test/api/capi/test_capi_profiling.cpp b/test/api/capi/test_capi_profiling.cpp index dc63e141128..79aa41d9e36 100644 --- a/test/api/capi/test_capi_profiling.cpp +++ b/test/api/capi/test_capi_profiling.cpp @@ -3,7 +3,7 @@ using namespace duckdb; using namespace std; -string BuildProfilingSettingsString(const std::vector &settings) { +string BuildProfilingSettingsString(const duckdb::vector &settings) { string result = "'{"; for (idx_t i = 0; i < settings.size(); i++) { result += "\"" + settings[i] + "\": \"true\""; @@ -15,7 +15,9 @@ string BuildProfilingSettingsString(const std::vector &settings) { return result; } -void RetrieveAllMetrics(duckdb_profiling_info profiling_info, const std::vector &settings) { +void RetrieveAllMetrics(duckdb_profiling_info profiling_info, const duckdb::vector &settings, + duckdb::map &cumulative_counter, + duckdb::map &cumulative_result) { for (idx_t i = 0; i < settings.size(); i++) { auto value = duckdb_profiling_info_get_value(profiling_info, settings[i].c_str()); if (value != nullptr) { @@ -32,6 +34,15 @@ void RetrieveAllMetrics(duckdb_profiling_info profiling_info, const std::vector< REQUIRE(false); } + if (cumulative_counter.find(settings[i]) != cumulative_counter.end()) { + cumulative_counter[settings[i]] += result; + } + + // only take the root node's result + if (cumulative_result.find(settings[i]) != cumulative_result.end() && cumulative_result[settings[i]] == 0) { + cumulative_result[settings[i]] = result; + } + duckdb_destroy_value(&value); REQUIRE(result >= 0); } @@ -39,7 +50,9 @@ void RetrieveAllMetrics(duckdb_profiling_info profiling_info, const std::vector< } // Traverse the tree and retrieve all metrics -void TraverseTree(duckdb_profiling_info profiling_info, const std::vector &settings, bool is_root = true) { +void TraverseTree(duckdb_profiling_info profiling_info, const 
duckdb::vector &settings, + duckdb::map &cumulative_counter, duckdb::map &cumulative_result, + bool is_root = true) { if (is_root) { // At the root, only the query name is available auto query = duckdb_profiling_info_get_query(profiling_info); @@ -56,15 +69,19 @@ void TraverseTree(duckdb_profiling_info profiling_info, const std::vector(value * 1000); +} + TEST_CASE("Test Profiling with Single Metric", "[capi]") { CAPITester tester; duckdb::unique_ptr result; @@ -75,7 +92,7 @@ TEST_CASE("Test Profiling with Single Metric", "[capi]") { REQUIRE_NO_FAIL(tester.Query("PRAGMA enable_profiling = 'no_output'")); // test only CPU_TIME profiling - std::vector settings = {"CPU_TIME"}; + duckdb::vector settings = {"CPU_TIME"}; REQUIRE_NO_FAIL(tester.Query("PRAGMA custom_profiling_settings=" + BuildProfilingSettingsString(settings))); REQUIRE_NO_FAIL(tester.Query("SELECT 42")); @@ -86,7 +103,10 @@ TEST_CASE("Test Profiling with Single Metric", "[capi]") { // Retrieve metric that is not enabled REQUIRE(duckdb_profiling_info_get_value(profiling_info, "EXTRA_INFO") == nullptr); - TraverseTree(profiling_info, settings); + duckdb::map cumulative_counter; + duckdb::map cumulative_result; + + TraverseTree(profiling_info, settings, cumulative_counter, cumulative_result); // Cleanup tester.Cleanup(); @@ -102,7 +122,8 @@ TEST_CASE("Test Profiling with All Metrics", "[capi]") { REQUIRE_NO_FAIL(tester.Query("PRAGMA enable_profiling = 'no_output'")); // test all profiling metrics - std::vector settings = {"CPU_TIME", "EXTRA_INFO", "OPERATOR_CARDINALITY", "OPERATOR_TIMING"}; + duckdb::vector settings = {"CPU_TIME", "CUMULATIVE_CARDINALITY", "EXTRA_INFO", "OPERATOR_CARDINALITY", + "OPERATOR_TIMING"}; REQUIRE_NO_FAIL(tester.Query("PRAGMA custom_profiling_settings=" + BuildProfilingSettingsString(settings))); REQUIRE_NO_FAIL(tester.Query("SELECT 42")); @@ -110,7 +131,18 @@ TEST_CASE("Test Profiling with All Metrics", "[capi]") { auto profiling_info = duckdb_get_profiling_info(tester.connection); REQUIRE(profiling_info != nullptr); - TraverseTree(profiling_info, settings); + duckdb::map cumulative_counter = {{"OPERATOR_TIMING", 0}, {"OPERATOR_CARDINALITY", 0}}; + + duckdb::map cumulative_result { + {"CPU_TIME", 0}, + {"CUMULATIVE_CARDINALITY", 0}, + }; + + TraverseTree(profiling_info, settings, cumulative_counter, cumulative_result); + + REQUIRE(ConvertToInt(cumulative_result["CPU_TIME"]) == ConvertToInt(cumulative_counter["OPERATOR_TIMING"])); + REQUIRE(ConvertToInt(cumulative_result["CUMULATIVE_CARDINALITY"]) == + ConvertToInt(cumulative_counter["OPERATOR_CARDINALITY"])); // Cleanup tester.Cleanup(); diff --git a/test/fuzzer/sqlsmith/window-leadlag-overflow.test b/test/fuzzer/sqlsmith/window-leadlag-overflow.test index 6a7d02bbbed..b5064616920 100644 --- a/test/fuzzer/sqlsmith/window-leadlag-overflow.test +++ b/test/fuzzer/sqlsmith/window-leadlag-overflow.test @@ -10,3 +10,6 @@ SELECT lag(c16, COLUMNS(*)) OVER (ROWS BETWEEN 1768 FOLLOWING AND UNBOUNDED FOLL FROM all_types AS t42(c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c20, c21, c22, c23, c24, c25, c26, c27, c28, c29, c30, c31, c32, c33, c34, c35, c36, c37, c38, c39, c40, c41) ---- Overflow in subtraction + +statement ok +SELECT lead('1e668c84-6cbc-4d41-843e-970c17446f9e', 8479, 3087) OVER (PARTITION BY 9136) diff --git a/test/parquet/test_parquet_schema.test b/test/parquet/test_parquet_schema.test index 5a8a86e7e85..b7f41cb6ac1 100644 --- a/test/parquet/test_parquet_schema.test +++ b/test/parquet/test_parquet_schema.test 
@@ -40,7 +40,7 @@ statement ok COPY ( SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL SELECT 2 i1, 3 i3, 4 i4, 5 i5 -) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet) +) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS) # auto-detection of hive partitioning is enabled by default, # but automatically disabled when a schema is supplied, so this should succeed @@ -56,6 +56,15 @@ FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map { 5 3 2 1 5 3 2 2 +# when partition columns are specified in FIELD_IDS, error message should suggest WRITE_PARTITION_COLUMNS option +statement error +COPY ( + SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL + SELECT 2 i1, 3 i3, 4 i4, 5 i5 +) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet) +---- +Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names: + # cannot duplicate field_ids statement error SELECT * diff --git a/test/sql/copy/csv/test_partition_compression.test b/test/sql/copy/csv/test_partition_compression.test index c81ebcd1768..da909d007c0 100644 --- a/test/sql/copy/csv/test_partition_compression.test +++ b/test/sql/copy/csv/test_partition_compression.test @@ -22,18 +22,18 @@ No files found that match the pattern query III FROM read_csv_auto('__TEST_DIR__/data.csv.d/*/*/*.csv.gz'); ---- -a bar 1 -a foo 1 -a foo 2 -b bar 1 +1 a bar +1 a foo +2 a foo +1 b bar query III FROM read_csv_auto('__TEST_DIR__/data.csv.d/*/*/*.csv.*'); ---- -a bar 1 -a foo 1 -a foo 2 -b bar 1 +1 a bar +1 a foo +2 a foo +1 b bar require parquet @@ -49,15 +49,15 @@ No files found that match the pattern query III FROM read_csv_auto('__TEST_DIR__/data.csv.d2/*/*/*.csv.zst'); ---- -a bar 1 -a foo 1 -a foo 2 -b bar 1 +1 a bar +1 a foo +2 a foo +1 b bar query III FROM read_csv_auto('__TEST_DIR__/data.csv.d2/*/*/*.csv.*'); ---- -a bar 1 -a foo 1 -a foo 2 -b bar 1 +1 a bar +1 a foo +2 a foo +1 b bar diff --git a/test/sql/copy/csv/union_by_name_many_files.test_slow b/test/sql/copy/csv/union_by_name_many_files.test_slow index 67a0aafecc4..39e9305c1fb 100644 --- a/test/sql/copy/csv/union_by_name_many_files.test_slow +++ b/test/sql/copy/csv/union_by_name_many_files.test_slow @@ -28,7 +28,7 @@ statement ok copy (select 2000+(v//12)y,m,v,j from orders where m IN (8, 9, 10)) TO '__TEST_DIR__/many_files_${format}' (FORMAT ${format}, PARTITION_BY (y,m), APPEND); statement ok -copy (select 2000+(v//12)y,m from orders where m IN (11, 12)) TO '__TEST_DIR__/many_files_${format}' (FORMAT ${format}, PARTITION_BY (y,m), APPEND); +copy (select 2000+(v//12)y,m from orders where m IN (11, 12)) TO '__TEST_DIR__/many_files_${format}' (FORMAT ${format}, PARTITION_BY (y,m), APPEND, WRITE_PARTITION_COLUMNS); query IIIIIII SELECT COUNT(*), MIN(m), MAX(m), MIN(y), MAX(y), COUNT(v), COUNT(j) FROM read_${format}('__TEST_DIR__/many_files_${format}/**/*.${format}', union_by_name=true) diff --git a/test/sql/copy/format_uuid.test b/test/sql/copy/format_uuid.test index 47755530549..f9bfc5a2414 100644 --- a/test/sql/copy/format_uuid.test +++ b/test/sql/copy/format_uuid.test @@ -52,7 +52,7 @@ COPY test2 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PA query III SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????.parquet'; ---- -9 18 81 +18 81 9 # create a second file in the same 
partition dirs. This is the append workflow statement ok @@ -62,14 +62,14 @@ COPY test3 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_o query III sort SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????_trailing.parquet'; ---- -9 27 729 +27 729 9 # the partition dir contains the files from previous 2 queries query III sort SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????*.parquet'; ---- -9 18 81 -9 27 729 +18 81 9 +27 729 9 # Test without a specified format name for the outputfile. statement ok @@ -79,15 +79,15 @@ COPY test4 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_o query III sort SELECT * FROM '__TEST_DIR__/part/a=9/data_[0-9]*.parquet'; ---- -9 36 6561 +36 6561 9 # the partition dir contains the files from test[2-4] query III sort SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet'; ---- -9 18 81 -9 27 729 -9 36 6561 +18 81 9 +27 729 9 +36 6561 9 # Test where the FILENAME_PATTERN does not contain "{i}" or "{uuid}". statement ok @@ -97,16 +97,16 @@ COPY test5 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_o query III sort SELECT * FROM '__TEST_DIR__/part/a=9/basename[0-9]*.parquet'; ---- -9 45 59049 +45 59049 9 # the partition dir contains the files from test[2-5] query III sort SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet'; ---- -9 18 81 -9 27 729 -9 36 6561 -9 45 59049 +18 81 9 +27 729 9 +36 6561 9 +45 59049 9 # Test without the overwrite_or_ignore param, that tries to add a file to an existing directory statement error @@ -133,7 +133,7 @@ SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten/a=9/a_file_name*'); query III sort SELECT * FROM '__TEST_DIR__/to_be_overwritten/a=9/a_file_name*.parquet'; ---- -9 36 6561 +36 6561 9 # Test with a combination of {i} and {uuid} @@ -143,7 +143,7 @@ COPY test5 TO '__TEST_DIR__/incorrect_pos' (FORMAT PARQUET, PARTITION_BY (a), fi query III sort SELECT * FROM '__TEST_DIR__/incorrect_pos/a=5/a_name_with_????????-????-4???-????-????????????_numbers.parquet'; ---- -5 25 3125 +25 3125 5 # Test "{uuid}" with per_thread_output TRUE statement ok @@ -174,7 +174,7 @@ SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*'); query III sort SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet'; ---- -9 45 59049 +45 59049 9 # Test where overwrite_or_ignore param == true, and filename_pattern is set to an existing file statement ok @@ -188,4 +188,18 @@ SELECT COUNT(*) FROM GLOB('__TEST_DIR__/directory0/a=7/*'); query III sort SELECT * FROM '__TEST_DIR__/directory0/a=7/my_filename0.parquet'; ---- -7 35 16807 +35 16807 7 + +# When partition columns are written to files, `select *` returns in order of columns +statement ok +COPY test5 TO '__TEST_DIR__/to_be_overwritten2' (FORMAT PARQUET, PARTITION_BY (a), OVERWRITE_OR_IGNORE, WRITE_PARTITION_COLUMNS); + +query I +SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*'); +---- +1 + +query III sort +SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet'; +---- +9 45 59049 diff --git a/test/sql/copy/hive_types.test_slow b/test/sql/copy/hive_types.test_slow index 1400e5d718a..71adc1ddfc7 100644 --- a/test/sql/copy/hive_types.test_slow +++ b/test/sql/copy/hive_types.test_slow @@ -56,10 +56,12 @@ Invalid Input Error: hive_types: 'season' must be a VARCHAR, instead: 'INTEGER' # basic tests with hive_types_autocast -query III + +# when hive_partitioning=0, data won't be read from directory names unless partition columns are written to files. 
+statement error select typeof(season),typeof(director),typeof(aired) from read_parquet('__TEST_DIR__/partition/**/*.parquet', hive_partitioning=0) limit 1; ---- -BIGINT VARCHAR DATE +Binder Error: Referenced column "season" not found in FROM clause! query III select typeof(season),typeof(director),typeof(aired) from read_parquet('__TEST_DIR__/partition/**/*.parquet', hive_partitioning=1, hive_types_autocast=0) limit 1; @@ -104,12 +106,26 @@ select * from read_parquet('__TEST_DIR__/partition_types/**/*.parquet', hive_typ Unable to cast # Complex filter filtering first file, filter should be pruned completely if hive_partitioning=1 -query II +statement error explain from parquet_scan('__TEST_DIR__/partition/**/*.parquet', HIVE_PARTITIONING=0, HIVE_TYPES_AUTOCAST=0) where aired < '2006-1-1'; ---- -physical_plan :.*(PARQUET_SCAN.*Filters:|FILTER).* +Binder Error: Referenced column "aired" not found in FROM clause! query II explain (FORMAT JSON) from parquet_scan('__TEST_DIR__/partition/**/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where aired < '2006-1-1'; ---- -physical_plan :.*PARQUET_SCAN.*File Filters.*aired.* +physical_plan :.*(PARQUET_SCAN.*File Filters": "\(aired \<).* + +# When partition columns are written, partition columns can be read even with HIVE_PARTITIONING=0 +statement ok +copy 'data/csv/hive-partitioning/hive_types/himym.csv' to '__TEST_DIR__/partition-written' (format parquet, partition_by(season,director,aired), write_partition_columns); + +query III +select typeof(season),typeof(director),typeof(aired) from read_parquet('__TEST_DIR__/partition-written/**/*.parquet', hive_partitioning=0) limit 1; +---- +BIGINT VARCHAR DATE + +query II +explain from parquet_scan('__TEST_DIR__/partition-written/**/*.parquet', HIVE_PARTITIONING=0, HIVE_TYPES_AUTOCAST=0) where aired < '2006-1-1'; +---- +physical_plan :.*PARQUET_SCAN.*Filters:.*aired.* diff --git a/test/sql/copy/parquet/hive_partitioning_struct.test b/test/sql/copy/parquet/hive_partitioning_struct.test index 45e2adb19bd..136be3e1e55 100644 --- a/test/sql/copy/parquet/hive_partitioning_struct.test +++ b/test/sql/copy/parquet/hive_partitioning_struct.test @@ -13,7 +13,7 @@ COPY (SELECT i//50 id, {'a': i} s FROM range(100) t(i)) TO '__TEST_DIR__/hive_pa query II SELECT * FROM read_parquet('__TEST_DIR__/hive_partitioned_struct_col/**/*.parquet', hive_partitioning=1) WHERE s.a=42 ---- -0 {'a': 42} +{'a': 42} 0 # what if the hive types themselves are structs? 
statement ok diff --git a/test/sql/copy/parquet/hive_timestamps.test b/test/sql/copy/parquet/hive_timestamps.test index b1d76930788..6e29e0d9bc1 100644 --- a/test/sql/copy/parquet/hive_timestamps.test +++ b/test/sql/copy/parquet/hive_timestamps.test @@ -53,7 +53,7 @@ COPY ( ); query II -SELECT * +SELECT bucket, total FROM read_parquet('__TEST_DIR__/hive/*/*.parquet') ORDER BY ALL LIMIT 5 diff --git a/test/sql/copy/parquet/parquet_hive_null.test b/test/sql/copy/parquet/parquet_hive_null.test index dcd3d4f6537..3c01c0c7a1b 100644 --- a/test/sql/copy/parquet/parquet_hive_null.test +++ b/test/sql/copy/parquet/parquet_hive_null.test @@ -8,7 +8,7 @@ statement ok create table test as select i%5 as a, i%2 as b from range(0,10) tbl(i); statement ok -copy (FROM test UNION ALL select 'NULL' as a, 'NULL' as b) to '__TEST_DIR__/null-parquet' (PARTITION_BY (a,b), FORMAT 'parquet'); +copy (FROM test UNION ALL select 'NULL' as a, 'NULL' as b) to '__TEST_DIR__/null-parquet' (PARTITION_BY (a,b), FORMAT 'parquet', WRITE_PARTITION_COLUMNS); query II select * @@ -26,3 +26,26 @@ ORDER BY ALL 4 0 4 1 NULL NULL + +statement ok +create table test2 as select i%5 as a, i%2 as b, i as c from range(0,10) tbl(i); + +statement ok +copy (FROM test2 UNION ALL select 'NULL' as a, 'NULL' as b, 'NULL' as c) to '__TEST_DIR__/null-parquet' (PARTITION_BY (a,b), FORMAT 'parquet', OVERWRITE); + +query III +select * +from parquet_scan('__TEST_DIR__/null-parquet/**/*.parquet', hive_partitioning=1, hive_types={'a': INT}) +ORDER BY ALL +---- +0 0 0 +1 1 1 +2 2 0 +3 3 1 +4 4 0 +5 0 1 +6 1 0 +7 2 1 +8 3 0 +9 4 1 +NULL NULL NULL diff --git a/test/sql/copy/parquet/writer/parquet_write_field_id.test b/test/sql/copy/parquet/writer/parquet_write_field_id.test index c21ad661a08..802aef5c553 100644 --- a/test/sql/copy/parquet/writer/parquet_write_field_id.test +++ b/test/sql/copy/parquet/writer/parquet_write_field_id.test @@ -173,7 +173,7 @@ select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'ele statement error copy (select range(range, range + 3) as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,elem:43}}) ---- -Binder Error: Column name "elem" specified in FIELD_IDS not found. Available column names: [element] +Binder Error: Column name "elem" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names: [element] # struct statement ok @@ -193,7 +193,7 @@ select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'f' statement error copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,g:43}}) ---- -Binder Error: Column name "g" specified in FIELD_IDS not found. Available column names: [f] +Binder Error: Column name "g" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. 
Available column names: [f] # map statement ok diff --git a/test/sql/copy/partitioned/hive_partition_compression.test b/test/sql/copy/partitioned/hive_partition_compression.test index 93b1f8a6e0e..f3aea2634aa 100644 --- a/test/sql/copy/partitioned/hive_partition_compression.test +++ b/test/sql/copy/partitioned/hive_partition_compression.test @@ -18,8 +18,8 @@ query III FROM read_parquet('__TEST_DIR__/hive_partition_compress/*/*/*.parquet') ORDER BY ALL ---- -a bar 1 -a foo 1 -a foo 2 -b bar 1 +1 a bar +1 a foo +1 b bar +2 a foo diff --git a/test/sql/copy/partitioned/hive_partition_escape.test b/test/sql/copy/partitioned/hive_partition_escape.test index 2f5dd3e8f5d..49e4c5ec709 100644 --- a/test/sql/copy/partitioned/hive_partition_escape.test +++ b/test/sql/copy/partitioned/hive_partition_escape.test @@ -43,9 +43,22 @@ value with strings 1 statement ok ALTER TABLE weird_tbl RENAME COLUMN key TO "=/ \\/" +# this column name won't work with automatic hive partitioning due to the equals character statement ok COPY weird_tbl TO '__TEST_DIR__/escaped_partitions_names' (FORMAT PARQUET, PARTITION_BY("=/ \\/")) +statement error +select "=/ \\/", COUNT(*) +from parquet_scan('__TEST_DIR__/escaped_partitions_names/**/*.parquet') +GROUP BY ALL +ORDER BY ALL +---- +Binder Error: Referenced column "=/ \\/" not found in FROM clause! + + +# if we write the partition column into the files, it can be read +statement ok +COPY weird_tbl TO '__TEST_DIR__/escaped_partitions_names' (FORMAT PARQUET, PARTITION_BY("=/ \\/"), OVERWRITE, WRITE_PARTITION_COLUMNS) query II select "=/ \\/", COUNT(*) diff --git a/test/sql/copy/partitioned/hive_partitioned_auto_detect.test b/test/sql/copy/partitioned/hive_partitioned_auto_detect.test index 9bc69bea361..a93d9f6e737 100644 --- a/test/sql/copy/partitioned/hive_partitioned_auto_detect.test +++ b/test/sql/copy/partitioned/hive_partitioned_auto_detect.test @@ -6,6 +6,7 @@ statement ok CREATE TABLE t AS SELECT i%2 AS year, i%3 AS month, i%4 AS c, i%5 AS d FROM RANGE(0,20) tbl(i); +# without partition columns written # test a csv partition by year statement ok COPY t TO '__TEST_DIR__/csv_partition_1' (partition_by(year)); @@ -15,6 +16,201 @@ select count(*) from glob('__TEST_DIR__/csv_partition_1/**'); ---- 2 +# with HIVE_PARTITIONING=0, directory names won't be read unless they are written in the data +query III +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1; +---- +a b c + +# with HIVE_PARTITIONING, the column name from the directory name supersedes the "names" parameter +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1; +---- +a b c year + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d']) LIMIT 1; +---- +a b c year + +# test a csv partition by year,month +statement ok +COPY t TO '__TEST_DIR__/csv_partition_2' (partition_by(year,month)); + +query I +select count(*) from glob('__TEST_DIR__/csv_partition_2/**'); +---- +6 + +query II +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1; +---- +a b + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1; +---- +a b month year + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'])
LIMIT 1; +---- +a b month year + +# test a single file +query I +select count(*) from glob('__TEST_DIR__/t.csv'); +---- +0 + +statement ok +COPY t TO '__TEST_DIR__/bad_file.csv'; + +query I +select count(*) from glob('__TEST_DIR__/bad_file.csv'); +---- +1 + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1; +---- +a b c d + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1; +---- +a b c d + +query IIII +select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d']) LIMIT 1; +---- +a b c d + +# add bad file to list: hive partitioning will be false, because scheme doesn't match +query II +select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=0, names=['a','b','c','d']) LIMIT 1; +---- +a b + +statement error +select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=1, names=['a','b','c','d']) LIMIT 1; +---- +Binder Error: Hive partition mismatch + +query II +select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], names=['a','b','c','d']) LIMIT 1; +---- +a b + + + +# same tests with parquet +require parquet + +# test a parquet partition by year +statement ok +COPY t TO '__TEST_DIR__/parquet_partition_1' (format parquet, partition_by(year)); + +query I +select count(*) from glob('__TEST_DIR__/parquet_partition_1/**'); +---- +2 + +query III +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=0) LIMIT 1; +---- +month c d + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=1) LIMIT 1; +---- +month c d year + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**') LIMIT 1; +---- +month c d year + +# test a parquet partition by year,month +statement ok +COPY t TO '__TEST_DIR__/parquet_partition_2' (format parquet, partition_by(year,month)); + +query I +select count(*) from glob('__TEST_DIR__/parquet_partition_2/**'); +---- +6 + +query II +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=0) LIMIT 1; +---- +c d + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=1) LIMIT 1; +---- +c d month year + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**') LIMIT 1; +---- +c d month year + +# test a single file +query I +select count(*) from glob('__TEST_DIR__/t.parquet'); +---- +0 + +statement ok +COPY t TO '__TEST_DIR__/t.parquet' (format parquet); + +query I +select count(*) from glob('__TEST_DIR__/t.parquet'); +---- +1 + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=0) LIMIT 1; +---- +year month c d + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=1) LIMIT 1; +---- +year month c d + +query IIII +select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet') LIMIT 1; +---- +year month c d + +# add bad file to list: hive partitioning will be false, because scheme doesn't match +query II +select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', 
'__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=0) LIMIT 1; +---- +c d + +statement error +select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=1) LIMIT 1; +---- +Binder Error: Hive partition mismatch + +query II +select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet']) LIMIT 1; +---- +c d + + +# with partition columns written +# test a csv partition by year +statement ok +COPY t TO '__TEST_DIR__/csv_partition_1' (partition_by(year), overwrite_or_ignore, write_partition_columns); + +query I +select count(*) from glob('__TEST_DIR__/csv_partition_1/**'); +---- +2 + query IIII select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1; ---- @@ -32,7 +228,7 @@ a b c d year # test a csv partition by year,month statement ok -COPY t TO '__TEST_DIR__/csv_partition_2' (partition_by(year,month)); +COPY t TO '__TEST_DIR__/csv_partition_2' (partition_by(year,month), overwrite_or_ignore, write_partition_columns); query I select count(*) from glob('__TEST_DIR__/csv_partition_2/**'); @@ -106,7 +302,7 @@ require parquet # test a parquet partition by year statement ok -COPY t TO '__TEST_DIR__/parquet_partition_1' (format parquet, partition_by(year)); +COPY t TO '__TEST_DIR__/parquet_partition_1' (format parquet, partition_by(year), overwrite_or_ignore, write_partition_columns); query I select count(*) from glob('__TEST_DIR__/parquet_partition_1/**'); @@ -130,7 +326,7 @@ year month c d # test a parquet partition by year,month statement ok -COPY t TO '__TEST_DIR__/parquet_partition_2' (format parquet, partition_by(year,month)); +COPY t TO '__TEST_DIR__/parquet_partition_2' (format parquet, partition_by(year,month), overwrite_or_ignore, write_partition_columns); query I select count(*) from glob('__TEST_DIR__/parquet_partition_2/**'); @@ -153,11 +349,6 @@ select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**' year month c d # test a single file -query I -select count(*) from glob('__TEST_DIR__/t.parquet'); ----- -0 - statement ok COPY t TO '__TEST_DIR__/t.parquet' (format parquet); @@ -197,9 +388,6 @@ select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/** ---- year month c d - - - query IIII select i,j,k,x from read_parquet('data/parquet-testing/hive-partitioning/union_by_name/*/*.parquet', hive_partitioning=0, union_by_name=1) diff --git a/test/sql/copy/partitioned/hive_partitioned_write.test b/test/sql/copy/partitioned/hive_partitioned_write.test index ac6104aa74e..01ac9388576 100644 --- a/test/sql/copy/partitioned/hive_partitioned_write.test +++ b/test/sql/copy/partitioned/hive_partitioned_write.test @@ -53,7 +53,7 @@ prefix-1 0 9 # Test partitioning by all statement ok -COPY test TO '__TEST_DIR__/partitioned3' (FORMAT PARQUET, PARTITION_BY '*'); +COPY test TO '__TEST_DIR__/partitioned3' (FORMAT PARQUET, PARTITION_BY '*', WRITE_PARTITION_COLUMNS); query I SELECT min(value2_col) as min_val diff --git a/test/sql/copy/partitioned/hive_partitioning_overwrite.test b/test/sql/copy/partitioned/hive_partitioning_overwrite.test index 4035c38c34c..eac9b8e117b 100644 --- a/test/sql/copy/partitioned/hive_partitioning_overwrite.test +++ b/test/sql/copy/partitioned/hive_partitioning_overwrite.test @@ -6,40 +6,40 @@ require parquet # write a partition with value 42 statement ok -COPY (SELECT 42 AS part_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY 
(part_col)); +COPY (SELECT 42 AS part_col, 43 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col)); # writing to the same directory fails now statement error -COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col)); +COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col)); ---- Enable OVERWRITE option to overwrite files # test the overwrite setting statement ok -COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1); +COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1); # the old file (with part_col=42) should now be removed -query I +query II SELECT * FROM '__TEST_DIR__/overwrite_test/**/*.parquet' ---- -84 +85 84 # what if the file is a file? statement ok COPY (SELECT 42 AS part_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET); statement error -COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col)); +COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col)); ---- it exists and is a file statement ok -COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1); +COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1); -query I +query II SELECT * FROM '__TEST_DIR__/overwrite_test2/**/*.parquet' ---- -84 +85 84 statement error COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1, OVERWRITE_OR_IGNORE 1); diff --git a/test/sql/copy/partitioned/partition_issue_6304.test b/test/sql/copy/partitioned/partition_issue_6304.test index d2ac8dc2adb..b4a0f43b087 100644 --- a/test/sql/copy/partitioned/partition_issue_6304.test +++ b/test/sql/copy/partitioned/partition_issue_6304.test @@ -5,7 +5,13 @@ require parquet statement ok -copy (select NULL as i from range(100000)) to '__TEST_DIR__/issue6304_null' (format parquet, partition_by(i), overwrite_or_ignore); +copy (select NULL as i, NULL as j from range(100000)) to '__TEST_DIR__/issue6304_null' (format parquet, partition_by(i), overwrite_or_ignore); statement ok -copy (select 1 as i from range(100000)) to '__TEST_DIR__/issue6304_constant' (format parquet, partition_by(i), overwrite_or_ignore); +copy (select 1 as i, 2 as j from range(100000)) to '__TEST_DIR__/issue6304_constant' (format parquet, partition_by(i), overwrite_or_ignore); + +statement ok +copy (select NULL as i from range(100000)) to '__TEST_DIR__/issue6304_null' (format parquet, partition_by(i), overwrite_or_ignore, write_partition_columns); + +statement ok +copy (select 1 as i from range(100000)) to '__TEST_DIR__/issue6304_constant' (format parquet, partition_by(i), overwrite_or_ignore, write_partition_columns); diff --git a/test/sql/copy/partitioned/partitioned_write_tpch.test_slow b/test/sql/copy/partitioned/partitioned_write_tpch.test_slow index cf6a31c0e2a..3fea9d045bf 100644 --- a/test/sql/copy/partitioned/partitioned_write_tpch.test_slow +++ b/test/sql/copy/partitioned/partitioned_write_tpch.test_slow @@ -51,10 +51,7 @@ DROP VIEW lineitem # try the CSV next - but copy it into a regular table statement ok -CREATE TABLE lineitem(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT 
NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); - -statement ok -COPY lineitem FROM '__TEST_DIR__/lineitem_partitioned_csv/**/*.csv' +CREATE TABLE lineitem AS FROM read_csv('__TEST_DIR__/lineitem_partitioned_csv/**/*.csv') # now run tpc-h - results should be the same loop i 1 9 diff --git a/test/sql/copy/partitioned/skip_partition_column_writes.test b/test/sql/copy/partitioned/skip_partition_column_writes.test new file mode 100644 index 00000000000..815f4039207 --- /dev/null +++ b/test/sql/copy/partitioned/skip_partition_column_writes.test @@ -0,0 +1,240 @@ +# name: test/sql/copy/partitioned/skip_partition_column_writes.test +# description: Skip partition column writes (issue 11931 & 12147) +# group: [partitioned] + +require parquet + +statement ok +CREATE TABLE test as SELECT i%2 as part_col, (i+1)%5 as value_col, i as value2_col from range(0,10) tbl(i); + +# Parquet + +# Skip write of the first partition column +statement ok +COPY test TO '__TEST_DIR__/no-part-cols' (FORMAT PARQUET, PARTITION_BY (part_col)); + +# SELECT query returns all columns, but written files do not have partition columns +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols/part_col=0/*.parquet' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# Skip writes of 2 partition columns +statement ok +COPY test TO '__TEST_DIR__/no-part-cols2' (FORMAT PARQUET, PARTITION_BY (part_col, value_col)); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols2/part_col=0/value_col=*/*.parquet' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# Modified version of the partition_col +statement ok +COPY (SELECT * EXCLUDE (part_col), 'prefix-'::VARCHAR || part_col::VARCHAR as part_col FROM test) TO '__TEST_DIR__/no-part-cols3' (FORMAT PARQUET, PARTITION_BY (part_col)); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols3/part_col=prefix-0/*.parquet' ORDER BY value2_col; +---- +prefix-0 1 0 +prefix-0 3 2 +prefix-0 0 4 +prefix-0 2 6 +prefix-0 4 8 + +# Partitions of more than 8 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols4' (FORMAT PARQUET, PARTITION_BY (part_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols4/part_col=1/*.parquet' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Partition by last column out of 10 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols5' (FORMAT PARQUET, PARTITION_BY (value9_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols5/value9_col=*/*.parquet' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 
9 10 + +# Partition by last 2 columns out of 10 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols6' (FORMAT PARQUET, PARTITION_BY (value8_col, value9_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols6/value8_col=*/value9_col=*/*.parquet' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Partition by last 3 columns out of 10 columns in a reverse order +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols7' (FORMAT PARQUET, PARTITION_BY (value9_col, value8_col, value7_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols7/value9_col=*/value8_col=*/value7_col=*/*.parquet' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Throw an error when all columns are specified as partitions +statement error +COPY test TO '__TEST_DIR__/no-part-cols8' (FORMAT PARQUET, PARTITION_BY (part_col, value_col, value2_col)); +---- +Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns. + +# With explicit WRITE_PARTITION_COLUMNS option, all columns would still be written and still readable. +statement ok +COPY test TO '__TEST_DIR__/no-part-cols8' (FORMAT PARQUET, OVERWRITE, PARTITION_BY (part_col, value_col, value2_col), WRITE_PARTITION_COLUMNS); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols8/part_col=0/value_col=*/value2_col=*/*.parquet' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# '*' also ends up with error +statement error +COPY test TO '__TEST_DIR__/no-part-cols9' (FORMAT PARQUET, PARTITION_BY '*'); +---- +Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns. + +# With explicit WRITE_PARTITION_COLUMNS option, all columns would still be written and still readable. 
+statement ok +COPY test TO '__TEST_DIR__/no-part-cols9' (FORMAT PARQUET, PARTITION_BY '*', OVERWRITE, WRITE_PARTITION_COLUMNS); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols9/part_col=0/value_col=*/value2_col=*/*.parquet' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# CSV + +# Skip write of the first partition column +statement ok +COPY test TO '__TEST_DIR__/csv-no-part-cols' (FORMAT CSV, PARTITION_BY (part_col)); + +# SELECT query returns all columns, but written files do not have partition columns +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols/part_col=0/*.csv' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# Skip writes of 2 partition columns +statement ok +COPY test TO '__TEST_DIR__/csv-no-part-cols2' (FORMAT CSV, PARTITION_BY (part_col, value_col)); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols2/part_col=0/value_col=*/*.csv' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# Modified version of the partition_col +statement ok +COPY (SELECT * EXCLUDE (part_col), 'prefix-'::VARCHAR || part_col::VARCHAR as part_col FROM test) TO '__TEST_DIR__/csv-no-part-cols3' (FORMAT CSV, PARTITION_BY (part_col)); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols3/part_col=prefix-0/*.csv' ORDER BY value2_col; +---- +prefix-0 1 0 +prefix-0 3 2 +prefix-0 0 4 +prefix-0 2 6 +prefix-0 4 8 + +# Partitions of more than 8 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols4' (FORMAT CSV, PARTITION_BY (part_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols4/part_col=1/*.csv' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Partition by last column out of 10 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols5' (FORMAT CSV, PARTITION_BY (value9_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols5/value9_col=*/*.csv' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Partition by last 2 columns out of 10 columns +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols6' (FORMAT CSV, PARTITION_BY (value8_col, value9_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols6/value8_col=*/value9_col=*/*.csv' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Partition by last 3 columns out of 10 columns in a reverse order +statement ok +COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols7' (FORMAT CSV, PARTITION_BY (value9_col, 
value8_col, value7_col)); + +query IIIIIIIIII +SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols7/value9_col=*/value8_col=*/value7_col=*/*.csv' ORDER BY 1; +---- +1 2 3 4 5 6 7 8 9 10 + +# Throw an error when all columns are specified as partitions +statement error +COPY test TO '__TEST_DIR__/csv-no-part-cols8' (FORMAT CSV, PARTITION_BY (part_col, value_col, value2_col)); +---- +Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns. + +# With explicit WRITE_PARTITION_COLUMNS option, all columns would still be written and still readable. +statement ok +COPY test TO '__TEST_DIR__/csv-no-part-cols8' (FORMAT CSV, PARTITION_BY (part_col, value_col, value2_col), OVERWRITE, WRITE_PARTITION_COLUMNS); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols8/part_col=0/value_col=*/value2_col=*/*.csv' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 + +# '*' also ends up with error +statement error +COPY test TO '__TEST_DIR__/csv-no-part-cols9' (FORMAT CSV, PARTITION_BY '*'); +---- +Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns. + +# With explicit WRITE_PARTITION_COLUMNS option, all columns would still be written and still readable. +statement ok +COPY test TO '__TEST_DIR__/csv-no-part-cols9' (FORMAT CSV, PARTITION_BY '*', OVERWRITE, WRITE_PARTITION_COLUMNS); + +query III +SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols9/part_col=0/value_col=*/value2_col=*/*.csv' ORDER BY value2_col; +---- +0 1 0 +0 3 2 +0 0 4 +0 2 6 +0 4 8 diff --git a/test/sql/json/table/json_multi_file_reader.test b/test/sql/json/table/json_multi_file_reader.test index 80bd8b20e45..a56f7aee1ca 100644 --- a/test/sql/json/table/json_multi_file_reader.test +++ b/test/sql/json/table/json_multi_file_reader.test @@ -5,7 +5,7 @@ require json statement ok -create table test as SELECT to_json([i%4]) as j FROM range(0,20) as tbl(i) +create table test as SELECT i as i, to_json([i%4]) as j FROM range(0,20) as tbl(i) # FIXME: we can't do partitioned JSON writes yet because the column we partition by is packed into a to_json # because we just push an expression and then use the csv writer, this uses the csv writer for now diff --git a/test/sql/pragma/test_custom_profiling_settings.test b/test/sql/pragma/test_custom_profiling_settings.test index 913cb25a002..1ea51cb9317 100644 --- a/test/sql/pragma/test_custom_profiling_settings.test +++ b/test/sql/pragma/test_custom_profiling_settings.test @@ -31,6 +31,7 @@ SELECT unnest(res) from ( ) ---- "CPU_TIME": "true" +"CUMULATIVE_CARDINALITY": "true" "EXTRA_INFO": "true" "OPERATOR_CARDINALITY": "true" "OPERATOR_TIMING": "true" @@ -80,6 +81,12 @@ SELECT operator_timing FROM metrics_output; ---- Referenced column "operator_timing" not found in FROM clause! +statement error +SELECT cumulative_cardinality FROM metrics_output; +---- +Referenced column "cumulative_cardinality" not found in FROM clause! + + # change the cpu time to false and the rest to true and re-run the query statement ok PRAGMA custom_profiling_settings='{"CPU_TIME": "false", "EXTRA_INFO": "true", "OPERATOR_CARDINALITY": "true", "OPERATOR_TIMING": "true"}' @@ -115,9 +122,9 @@ Referenced column "cpu_time" not found in FROM clause! 
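The profiling test changes above register the new CUMULATIVE_CARDINALITY metric and check that it is populated even when OPERATOR_CARDINALITY is disabled. As a rough, hedged illustration only (not part of the patch), the sketch below shows how the same pragmas could be driven from the Python client; the `profiling_output` setting name, the temporary file path, and the exact JSON key are assumptions, and it presumes a build that already contains this change.

```python
# Sketch only: enable the new CUMULATIVE_CARDINALITY metric via the pragmas used in the test above.
import json
import os
import tempfile

import duckdb

con = duckdb.connect()
profile_path = os.path.join(tempfile.mkdtemp(), "profile.json")  # illustrative output path

settings = '{"CPU_TIME": "true", "EXTRA_INFO": "true", "CUMULATIVE_CARDINALITY": "true"}'
con.execute(f"PRAGMA custom_profiling_settings='{settings}'")
con.execute("PRAGMA enable_profiling='json'")
con.execute(f"PRAGMA profiling_output='{profile_path}'")  # assumed setting name for the output file

con.execute("SELECT count(*) FROM range(1000) t(i) JOIN range(1000) s(j) ON i = j").fetchall()
con.execute("PRAGMA disable_profiling")  # flush the collected profile to disk

with open(profile_path) as f:
    profile = json.load(f)
# With the patch applied, the root node is expected to carry a cumulative cardinality entry.
print(profile.get("cumulative_cardinality"))
```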
statement ok SELECT extra_info, operator_cardinality, operator_timing FROM metrics_output; -# Remove time from the settings file but add back the cpu time +# Remove time and operator cardinality from the settings file but add back the cpu time and cumulative cardinality statement ok -PRAGMA custom_profiling_settings='{"CPU_TIME": "true", "EXTRA_INFO": "true", "OPERATOR_CARDINALITY": "true"}' +PRAGMA custom_profiling_settings='{"CPU_TIME": "true", "EXTRA_INFO": "true", "CUMULATIVE_CARDINALITY": "true"}' query I rowsort SELECT unnest(res) from ( @@ -127,8 +134,8 @@ SELECT unnest(res) from ( ) ---- "CPU_TIME": "true" +"CUMULATIVE_CARDINALITY": "true" "EXTRA_INFO": "true" -"OPERATOR_CARDINALITY": "true" statement ok PRAGMA enable_profiling = 'json'; @@ -153,6 +160,18 @@ FROM metrics_output; ---- true +# Even though operator cardinality is set to false, it still should have been collected so that cumulative cardinality can be calculated +query I +SELECT +CASE + WHEN cumulative_cardinality > 0 THEN 'true' + ELSE 'false' +END +FROM metrics_output; +---- +true + + statement error SELECT operator_timing FROM metrics_output; ---- diff --git a/test/sql/secrets/create_secret_persistence_error_handling.test b/test/sql/secrets/create_secret_persistence_error_handling.test index 585126399b5..957f0a2732a 100644 --- a/test/sql/secrets/create_secret_persistence_error_handling.test +++ b/test/sql/secrets/create_secret_persistence_error_handling.test @@ -14,7 +14,7 @@ set secret_directory='__TEST_DIR__/create_secret_persistence_error_handling' # Hacky way to make duckdb create the create_secret_persistence_error_handling dir statement ok -COPY (select 1 as a ) to '__TEST_DIR__/create_secret_persistence_error_handling/' (FORMAT csv, PARTITION_BY a) +COPY (select 1 as a, 2 as b ) to '__TEST_DIR__/create_secret_persistence_error_handling/' (FORMAT csv, PARTITION_BY a) # Now write a corrupt secret file statement ok diff --git a/test/sql/types/nested/list/test_list_extract.test b/test/sql/types/nested/list/test_list_extract.test index 5c7af4310e8..bf19b634209 100644 --- a/test/sql/types/nested/list/test_list_extract.test +++ b/test/sql/types/nested/list/test_list_extract.test @@ -194,3 +194,40 @@ query I SELECT list_extract([1, 2, 3], -9223372036854775808); ---- NULL + +statement ok +CREATE TABLE list_array_table(a int[3][]); + +statement ok +INSERT INTO list_array_table VALUES ([[1,2,3], NULL, [4,5,6]]); + +query I +SELECT list_extract(a, 1) FROM list_array_table; +---- +[1, 2, 3] + +query I +SELECT list_extract(a, 2) FROM list_array_table; +---- +NULL + +query I +SELECT list_extract(a, 3) FROM list_array_table; +---- +[4, 5, 6] + +query I +SELECT list_extract(a, 4) FROM list_array_table; +---- +NULL + +query I +SELECT list_extract(a, -1) FROM list_array_table; +---- +[4, 5, 6] + +query I +SELECT list_extract(a, 0) FROM list_array_table; +---- +NULL + diff --git a/tools/pythonpkg/README.md b/tools/pythonpkg/README.md index 09f65bfeb9f..32b720bad33 100644 --- a/tools/pythonpkg/README.md +++ b/tools/pythonpkg/README.md @@ -49,23 +49,28 @@ storage from a notebook. First, get the repository based version number and extract the source distribution. - python3 -m pip install build # required for pep517 compliant source dists - cd tools/pythonpkg - export SETUPTOOLS_SCM_PRETEND_VERSION=$(python3 -m setuptools_scm) - pyproject-build . --sdist - cd ../.. 
+```bash +python3 -m pip install build # required for PEP 517 compliant source dists +cd tools/pythonpkg +export SETUPTOOLS_SCM_PRETEND_VERSION=$(python3 -m setuptools_scm) +pyproject-build . --sdist +cd ../.. +``` Next, copy over the python package related files, and install the package. - mkdir -p $DUCKDB_PREFIX/src/duckdb-pythonpkg - tar --directory=$DUCKDB_PREFIX/src/duckdb-pythonpkg -xzpf tools/pythonpkg/dist/duckdb-${SETUPTOOLS_SCM_PRETEND_VERSION}.tar.gz - pip3 install --prefix $DUCKDB_PREFIX -e $DUCKDB_PREFIX/src/duckdb-pythonpkg/duckdb-${SETUPTOOLS_SCM_PRETEND_VERSION} +```bash +mkdir -p $DUCKDB_PREFIX/src/duckdb-pythonpkg +tar --directory=$DUCKDB_PREFIX/src/duckdb-pythonpkg -xzpf tools/pythonpkg/dist/duckdb-${SETUPTOOLS_SCM_PRETEND_VERSION}.tar.gz +pip3 install --prefix $DUCKDB_PREFIX -e $DUCKDB_PREFIX/src/duckdb-pythonpkg/duckdb-${SETUPTOOLS_SCM_PRETEND_VERSION} +``` ## Development and Stubs `*.pyi` stubs are generated with [Mypy's `stubgen`](https://mypy.readthedocs.io/en/stable/stubgen.html) and tweaked. These are important for autocomplete in many IDEs, as static-analysis based language servers can't introspect `duckdb`'s binary module. -The stubs from stubgen are pretty good, but not perfect. In some cases, you can help stubgen out: for example, function annotation types that it can't figure out should be specified in the cpp where necessary, as in the example +The stubs from stubgen are pretty good, but not perfect. In some cases, you can help stubgen out: for example, function annotation types that it can't figure out should be specified in the cpp where necessary, as in the example. + ```cpp // without this change, the generated stub is // def query_df(self, df: object, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ... @@ -87,7 +92,7 @@ There is a test that you can run to check the stubs match the real duckdb packag The workflow for getting the stubs right will look something like -```sh +```bash # Edit python package... vim tools/pythonpkg/duckdb_python.cpp # or whatever @@ -107,10 +112,11 @@ pytest tests/stubs All the above should be done in a virtualenv. -## Frequently encountered issue with extensions: +## Frequently encountered issue with extensions If you are faced with an error on `import duckdb`: -``` + +```console Traceback (most recent call last): File "", line 1, in File "/usr/bin/python3/site-packages/duckdb/__init__.py", line 4, in @@ -144,6 +150,7 @@ Helpful information: `clang-tidy` is not a standard binary on MacOS, and can not be installed with brew directly (doing so will try to install clang-format, and they are not the same thing) Instead clang-tidy is part of `llvm`, so you'll need to install that (`brew install llvm`), after installing llvm you'll likely have to add the llvm binaries folder to your PATH variable to use clang-tidy For example: + ```bash export PATH="$PATH:/opt/homebrew/Cellar/llvm/15.0.2/bin" ``` diff --git a/tools/pythonpkg/duckdb-stubs/__init__.pyi b/tools/pythonpkg/duckdb-stubs/__init__.pyi index 92f6dcc4f99..efe3b1e1b43 100644 --- a/tools/pythonpkg/duckdb-stubs/__init__.pyi +++ b/tools/pythonpkg/duckdb-stubs/__init__.pyi @@ -673,7 +673,7 @@ def install_extension(extension: str, *, force_install: bool = False, connection def load_extension(extension: str, *, connection: DuckDBPyConnection = ...) -> None: ... def project(df: pandas.DataFrame, *args: str, groups: str = "", connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def distinct(df: pandas.DataFrame, *, connection: DuckDBPyConnection = ...) 
-> DuckDBPyRelation: ... -def write_csv(df: pandas.DataFrame, filename: str, *, sep: Optional[str] = None, na_rep: Optional[str] = None, header: Optional[bool] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, quoting: Optional[str | int] = None, encoding: Optional[str] = None, compression: Optional[str] = None, overwrite: Optional[bool] = None, per_thread_output: Optional[bool] = None, use_tmp_file: Optional[bool] = None, partition_by: Optional[List[str]] = None, connection: DuckDBPyConnection = ...) -> None: ... +def write_csv(df: pandas.DataFrame, filename: str, *, sep: Optional[str] = None, na_rep: Optional[str] = None, header: Optional[bool] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, quoting: Optional[str | int] = None, encoding: Optional[str] = None, compression: Optional[str] = None, overwrite: Optional[bool] = None, per_thread_output: Optional[bool] = None, use_tmp_file: Optional[bool] = None, partition_by: Optional[List[str]] = None, write_partition_columns: Optional[bool] = None, connection: DuckDBPyConnection = ...) -> None: ... def aggregate(df: pandas.DataFrame, aggr_expr: str | List[Expression], group_expr: str = "", *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def alias(df: pandas.DataFrame, alias: str, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def filter(df: pandas.DataFrame, filter_expr: str, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... diff --git a/tools/pythonpkg/duckdb_python.cpp b/tools/pythonpkg/duckdb_python.cpp index 48159022e81..ef6ae0923fa 100644 --- a/tools/pythonpkg/duckdb_python.cpp +++ b/tools/pythonpkg/duckdb_python.cpp @@ -852,13 +852,14 @@ static void InitializeConnectionMethods(py::module_ &m) { const py::object "ing = py::none(), const py::object &encoding = py::none(), const py::object &compression = py::none(), const py::object &overwrite = py::none(), const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(), - const py::object &partition_by = py::none(), shared_ptr conn = nullptr) { + const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(), + shared_ptr conn = nullptr) { if (!conn) { conn = DuckDBPyConnection::DefaultConnection(); } conn->FromDF(df)->ToCSV(filename, sep, na_rep, header, quotechar, escapechar, date_format, timestamp_format, quoting, encoding, compression, overwrite, per_thread_output, use_tmp_file, - partition_by); + partition_by, write_partition_columns); }, "Write the relation object to a CSV file in 'file_name'", py::arg("df"), py::arg("filename"), py::kw_only(), py::arg("sep") = py::none(), py::arg("na_rep") = py::none(), py::arg("header") = py::none(), @@ -866,7 +867,8 @@ static void InitializeConnectionMethods(py::module_ &m) { py::arg("timestamp_format") = py::none(), py::arg("quoting") = py::none(), py::arg("encoding") = py::none(), py::arg("compression") = py::none(), py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(), py::arg("use_tmp_file") = py::none(), - py::arg("partition_by") = py::none(), py::arg("connection") = py::none()); + py::arg("partition_by") = py::none(), py::arg("write_partition_columns") = py::none(), + py::arg("connection") = py::none()); m.def( "aggregate", [](const PandasDataFrame &df, const py::object &expr, const string &groups = 
"", diff --git a/tools/pythonpkg/scripts/connection_wrapper_methods.json b/tools/pythonpkg/scripts/connection_wrapper_methods.json index 472531ba773..19e01c815f1 100644 --- a/tools/pythonpkg/scripts/connection_wrapper_methods.json +++ b/tools/pythonpkg/scripts/connection_wrapper_methods.json @@ -103,6 +103,10 @@ "name": "partition_by", "type": "Optional[List[str]]", "default": "None" + }, { + "name": "write_partition_columns", + "type": "Optional[bool]", + "default": "None" } ], "docs": "Write the relation object to a CSV file in 'file_name'", diff --git a/tools/pythonpkg/src/include/duckdb_python/pyrelation.hpp b/tools/pythonpkg/src/include/duckdb_python/pyrelation.hpp index 93c2d56cb69..77ed7e762f1 100644 --- a/tools/pythonpkg/src/include/duckdb_python/pyrelation.hpp +++ b/tools/pythonpkg/src/include/duckdb_python/pyrelation.hpp @@ -216,7 +216,8 @@ struct DuckDBPyRelation { const py::object ×tamp_format = py::none(), const py::object "ing = py::none(), const py::object &encoding = py::none(), const py::object &compression = py::none(), const py::object &overwrite = py::none(), const py::object &per_thread_output = py::none(), - const py::object &use_tmp_file = py::none(), const py::object &partition_by = py::none()); + const py::object &use_tmp_file = py::none(), const py::object &partition_by = py::none(), + const py::object &write_partition_columns = py::none()); // should this return a rel with the new view? unique_ptr CreateView(const string &view_name, bool replace = true); diff --git a/tools/pythonpkg/src/pyrelation.cpp b/tools/pythonpkg/src/pyrelation.cpp index 3c99360e6c1..39e36a9425e 100644 --- a/tools/pythonpkg/src/pyrelation.cpp +++ b/tools/pythonpkg/src/pyrelation.cpp @@ -1171,7 +1171,8 @@ void DuckDBPyRelation::ToCSV(const string &filename, const py::object &sep, cons const py::object &date_format, const py::object ×tamp_format, const py::object "ing, const py::object &encoding, const py::object &compression, const py::object &overwrite, const py::object &per_thread_output, - const py::object &use_tmp_file, const py::object &partition_by) { + const py::object &use_tmp_file, const py::object &partition_by, + const py::object &write_partition_columns) { case_insensitive_map_t> options; if (!py::none().is(sep)) { @@ -1298,6 +1299,13 @@ void DuckDBPyRelation::ToCSV(const string &filename, const py::object &sep, cons options["partition_by"] = {partition_by_values}; } + if (!py::none().is(write_partition_columns)) { + if (!py::isinstance(write_partition_columns)) { + throw InvalidInputException("to_csv only accepts 'write_partition_columns' as a boolean"); + } + options["write_partition_columns"] = {Value::BOOLEAN(py::bool_(write_partition_columns))}; + } + auto write_csv = rel->WriteCSVRel(filename, std::move(options)); PyExecuteRelation(write_csv); } diff --git a/tools/pythonpkg/src/pyrelation/initialize.cpp b/tools/pythonpkg/src/pyrelation/initialize.cpp index 008f172ff85..3a7a507cb84 100644 --- a/tools/pythonpkg/src/pyrelation/initialize.cpp +++ b/tools/pythonpkg/src/pyrelation/initialize.cpp @@ -35,14 +35,14 @@ static void InitializeConsumers(py::class_ &m) { py::arg("compression") = py::none(), py::arg("field_ids") = py::none(), py::arg("row_group_size_bytes") = py::none(), py::arg("row_group_size") = py::none()); - DefineMethod({"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, - "Write the relation object to a CSV file in 'file_name'", py::arg("file_name"), py::kw_only(), - py::arg("sep") = py::none(), py::arg("na_rep") = py::none(), py::arg("header") = py::none(), - 
py::arg("quotechar") = py::none(), py::arg("escapechar") = py::none(), - py::arg("date_format") = py::none(), py::arg("timestamp_format") = py::none(), - py::arg("quoting") = py::none(), py::arg("encoding") = py::none(), py::arg("compression") = py::none(), - py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(), - py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none()); + DefineMethod( + {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'", + py::arg("file_name"), py::kw_only(), py::arg("sep") = py::none(), py::arg("na_rep") = py::none(), + py::arg("header") = py::none(), py::arg("quotechar") = py::none(), py::arg("escapechar") = py::none(), + py::arg("date_format") = py::none(), py::arg("timestamp_format") = py::none(), py::arg("quoting") = py::none(), + py::arg("encoding") = py::none(), py::arg("compression") = py::none(), py::arg("overwrite") = py::none(), + py::arg("per_thread_output") = py::none(), py::arg("use_tmp_file") = py::none(), + py::arg("partition_by") = py::none(), py::arg("write_partition_columns") = py::none()); m.def("fetchone", &DuckDBPyRelation::FetchOne, "Execute and fetch a single row as a tuple") .def("fetchmany", &DuckDBPyRelation::FetchMany, "Execute and fetch the next set of rows as a list of tuples", diff --git a/tools/pythonpkg/tests/fast/api/test_to_csv.py b/tools/pythonpkg/tests/fast/api/test_to_csv.py index 0b137327710..5a3ca3d0f63 100644 --- a/tools/pythonpkg/tests/fast/api/test_to_csv.py +++ b/tools/pythonpkg/tests/fast/api/test_to_csv.py @@ -180,6 +180,32 @@ def test_to_csv_partition(self, pandas): csv_rel = duckdb.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' ) + expected = [ + (True, 1.0, 42, 'a', 'a'), + (False, 3.2, None, 'b,c', 'a'), + (True, 3.0, 123, 'e', 'b'), + (True, 4.0, 321, 'f', 'b'), + ] + + assert csv_rel.execute().fetchall() == expected + + @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) + def test_to_csv_partition_with_columns_written(self, pandas): + temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + df = pandas.DataFrame( + { + "c_category": ['a', 'a', 'b', 'b'], + "c_bool": [True, False, True, True], + "c_float": [1.0, 3.2, 3.0, 4.0], + "c_int": [42, None, 123, 321], + "c_string": ["a", "b,c", "e", "f"], + } + ) + rel = duckdb.from_df(df) + rel.to_csv(temp_file_name, header=True, partition_by=["c_category"], write_partition_columns=True) + csv_rel = duckdb.sql( + f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' + ) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) @@ -201,6 +227,39 @@ def test_to_csv_overwrite(self, pandas): csv_rel = duckdb.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' ) + # When partition columns are read from directory names, column order become different from original + expected = [ + ('c', True, 1.0, 42, 'a', 'a'), + ('c', False, 3.2, None, 'b,c', 'a'), + ('d', True, 3.0, 123, 'e', 'b'), + ('d', True, 4.0, 321, 'f', 'b'), + ] + + assert csv_rel.execute().fetchall() == expected + + @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) + def test_to_csv_overwrite_with_columns_written(self, pandas): + temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + df = pandas.DataFrame( + { + 
"c_category_1": ['a', 'a', 'b', 'b'], + "c_category_2": ['c', 'c', 'd', 'd'], + "c_bool": [True, False, True, True], + "c_float": [1.0, 3.2, 3.0, 4.0], + "c_int": [42, None, 123, 321], + "c_string": ["a", "b,c", "e", "f"], + } + ) + rel = duckdb.from_df(df) + rel.to_csv( + temp_file_name, header=True, partition_by=["c_category_1"], write_partition_columns=True + ) # csv to be overwritten + rel.to_csv( + temp_file_name, header=True, partition_by=["c_category_1"], overwrite=True, write_partition_columns=True + ) + csv_rel = duckdb.sql( + f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' + ) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) diff --git a/tools/pythonpkg/tests/fast/test_filesystem.py b/tools/pythonpkg/tests/fast/test_filesystem.py index 0486298bf72..1bc4dcfb761 100644 --- a/tools/pythonpkg/tests/fast/test_filesystem.py +++ b/tools/pythonpkg/tests/fast/test_filesystem.py @@ -191,13 +191,52 @@ def test_database_attach(self, tmp_path: Path, monkeypatch: MonkeyPatch): def test_copy_partition(self, duckdb_cursor: DuckDBPyConnection, memory: AbstractFileSystem): duckdb_cursor.register_filesystem(memory) - duckdb_cursor.execute("copy (select 1 as a) to 'memory://root' (partition_by (a), HEADER 0)") + duckdb_cursor.execute("copy (select 1 as a, 2 as b) to 'memory://root' (partition_by (a), HEADER 0)") + + assert memory.open('/root/a=1/data_0.csv').read() == b'2\n' + + def test_copy_partition_with_columns_written(self, duckdb_cursor: DuckDBPyConnection, memory: AbstractFileSystem): + duckdb_cursor.register_filesystem(memory) + + duckdb_cursor.execute( + "copy (select 1 as a) to 'memory://root' (partition_by (a), HEADER 0, WRITE_PARTITION_COLUMNS)" + ) assert memory.open('/root/a=1/data_0.csv').read() == b'1\n' def test_read_hive_partition(self, duckdb_cursor: DuckDBPyConnection, memory: AbstractFileSystem): duckdb_cursor.register_filesystem(memory) - duckdb_cursor.execute("copy (select 2 as a) to 'memory://partition' (partition_by (a), HEADER 0)") + duckdb_cursor.execute( + "copy (select 2 as a, 3 as b, 4 as c) to 'memory://partition' (partition_by (a), HEADER 0)" + ) + + path = 'memory:///partition/*/*.csv' + + query = "SELECT * FROM read_csv_auto('" + path + "'" + + # hive partitioning + duckdb_cursor.execute(query + ', HIVE_PARTITIONING=1' + ');') + assert duckdb_cursor.fetchall() == [(3, 4, 2)] + + # hive partitioning: auto detection + duckdb_cursor.execute(query + ');') + assert duckdb_cursor.fetchall() == [(3, 4, 2)] + + # hive partitioning: cast to int + duckdb_cursor.execute(query + ', HIVE_PARTITIONING=1' + ', HIVE_TYPES_AUTOCAST=1' + ');') + assert duckdb_cursor.fetchall() == [(3, 4, 2)] + + # hive partitioning: no cast to int + duckdb_cursor.execute(query + ', HIVE_PARTITIONING=1' + ', HIVE_TYPES_AUTOCAST=0' + ');') + assert duckdb_cursor.fetchall() == [(3, 4, '2')] + + def test_read_hive_partition_with_columns_written( + self, duckdb_cursor: DuckDBPyConnection, memory: AbstractFileSystem + ): + duckdb_cursor.register_filesystem(memory) + duckdb_cursor.execute( + "copy (select 2 as a) to 'memory://partition' (partition_by (a), HEADER 0, WRITE_PARTITION_COLUMNS)" + ) path = 'memory:///partition/*/*.csv'