Commit 5e4f2e1: merge with feature

lnkuiper committed Oct 15, 2024
2 parents: 9e782f1 + b706529
Showing 40 changed files with 353 additions and 100 deletions.
2 changes: 1 addition & 1 deletion .github/config/distribution_matrix.json
@@ -39,7 +39,7 @@
"vcpkg_triplet": "x64-windows-static-md"
},
{
"duckdb_arch": "windows_amd64_rtools",
"duckdb_arch": "windows_amd64_mingw",
"vcpkg_triplet": "x64-mingw-static"
}
]
12 changes: 6 additions & 6 deletions .github/config/out_of_tree_extensions.cmake
@@ -39,7 +39,7 @@ if (NOT MINGW)
duckdb_extension_load(azure
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_azure
GIT_TAG b0ffe7ada20cdbd0bee2bbe5461ecd22fb468062
GIT_TAG a40ecb7bc9036eb8ecc5bf30db935a31b78011f5
APPLY_PATCHES
)
endif()
@@ -51,7 +51,7 @@ if (NOT MINGW AND NOT "${OS_NAME}" STREQUAL "linux")
duckdb_extension_load(delta
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_delta
GIT_TAG 3933ebd800ad06a64656c9aef6ca7d62897fa4db
GIT_TAG 811db25f5bd405dea186d6c461a642a387502ad8
APPLY_PATCHES
)
endif()
@@ -76,7 +76,7 @@ if (NOT MINGW)
duckdb_extension_load(iceberg
${LOAD_ICEBERG_TESTS}
GIT_URL https://github.com/duckdb/duckdb_iceberg
GIT_TAG 3f6d753787252e3da1d12157910b62edf729fc6e
GIT_TAG 8b48d1261564613274ac8e9fae01e572d965c99d
APPLY_PATCHES
)
endif()
@@ -106,7 +106,7 @@ endif()
duckdb_extension_load(spatial
DONT_LINK LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_spatial.git
GIT_TAG bb9c829693965f029eb5a312aefed4c538fad781
GIT_TAG 3f94d52aa9f7d67b1a30e6cea642bbb790c04aa2
INCLUDE_DIR spatial/include
TEST_DIR test/sql
APPLY_PATCHES
@@ -123,7 +123,7 @@ endif()
duckdb_extension_load(sqlite_scanner
${STATIC_LINK_SQLITE} LOAD_TESTS
GIT_URL https://github.com/duckdb/sqlite_scanner
GIT_TAG 315861963c8106397af36cbda10faebc8dae485a
GIT_TAG d5d62657702d33cb44a46cddc7ffc4b67bf7e961
APPLY_PATCHES
)

@@ -149,7 +149,7 @@ duckdb_extension_load(vss
LOAD_TESTS
DONT_LINK
GIT_URL https://github.com/duckdb/duckdb_vss
GIT_TAG 77739ea5382cce3220af83803ac0b1e98b3ab7d8
GIT_TAG dd880d6121c0f3dff27131e54e057c9db0f1c710
TEST_DIR test/sql
APPLY_PATCHES
)
4 changes: 2 additions & 2 deletions .github/workflows/R.yml
@@ -62,8 +62,8 @@ jobs:

- uses: ./.github/actions/build_extensions
with:
deploy_as: windows_amd64_rtools
duckdb_arch: windows_amd64_rtools
deploy_as: windows_amd64_mingw
duckdb_arch: windows_amd64_mingw
vcpkg_target_triplet: x64-mingw-static
treat_warn_as_error: 0
s3_id: ${{ secrets.S3_ID }}
3 changes: 3 additions & 0 deletions data/csv/glob_dif_dialect/14166/__2000.csv
@@ -0,0 +1,3 @@
date_col,int_col,double_col
2000-01-01,10,80.9189441112103
2000-01-02,5,109.16581782022259
1 change: 1 addition & 0 deletions data/csv/glob_dif_dialect/14166/__2001.csv
@@ -0,0 +1 @@
date_col,int_col,double_col
Empty file.
1 change: 1 addition & 0 deletions data/csv/glob_dif_dialect/14166/matching_types.csv
@@ -0,0 +1 @@
2003-01-02,5,109.16581782022259
18 changes: 12 additions & 6 deletions extension/core_functions/scalar/date/date_diff.cpp
@@ -28,6 +28,14 @@ struct DateDiff {
});
}

// We need to truncate towards negative infinity (floor division), not towards 0
static inline int64_t Truncate(int64_t value, int64_t units) {
return (value + (value < 0)) / units - (value < 0);
}
static inline int64_t Diff(int64_t start, int64_t end, int64_t units) {
return Truncate(end, units) - Truncate(start, units);
}

struct YearOperator {
template <class TA, class TB, class TR>
static inline TR Operation(TA startdate, TB enddate) {
@@ -204,30 +204,28 @@ template <>
int64_t DateDiff::MillisecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochMs(enddate) - Timestamp::GetEpochMs(startdate);
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MSEC);
}

template <>
int64_t DateDiff::SecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) - Timestamp::GetEpochSeconds(startdate);
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_SEC);
}

template <>
int64_t DateDiff::MinutesOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_MINUTE -
Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_MINUTE;
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MINUTE);
}

template <>
int64_t DateDiff::HoursOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_HOUR -
Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_HOUR;
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_HOUR);
}

// TIME specialisations
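The switch from epoch-based subtraction to Diff above matters for timestamps before the epoch: C++ integer division truncates towards zero, so negative microsecond counts would land in the wrong bucket and boundary crossings could be missed. A minimal standalone sketch of the technique, assuming positive unit sizes as with the Interval constants used in the diff:

#include <cassert>
#include <cstdint>

// Floor division: round towards negative infinity, not towards zero.
// Assumes units > 0.
static int64_t Truncate(int64_t value, int64_t units) {
	return (value + (value < 0)) / units - (value < 0);
}

// Number of unit boundaries crossed between start and end.
static int64_t Diff(int64_t start, int64_t end, int64_t units) {
	return Truncate(end, units) - Truncate(start, units);
}

int main() {
	constexpr int64_t MICROS_PER_SEC = 1000000;
	// -1 us lies in second -1; truncation towards zero would say second 0.
	assert(-1 / MICROS_PER_SEC == 0);
	assert(Truncate(-1, MICROS_PER_SEC) == -1);
	// One microsecond before the epoch to one after it crosses one boundary...
	assert(Diff(-1, 1, MICROS_PER_SEC) == 1);
	// ...which a subtraction of truncated-towards-zero values would miss.
	assert(1 / MICROS_PER_SEC - (-1) / MICROS_PER_SEC == 0);
	return 0;
}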
3 changes: 2 additions & 1 deletion extension/json/json_functions/json_structure.cpp
@@ -716,7 +716,8 @@ static LogicalType StructureToTypeObject(ClientContext &context, const JSONStruc
}

// If it's an inconsistent object we also just do MAP with the best-possible, recursively-merged value type
if (IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
if (map_inference_threshold != DConstants::INVALID_INDEX &&
IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
return LogicalType::MAP(LogicalType::VARCHAR,
GetMergedType(context, node, max_depth, field_appearance_threshold,
map_inference_threshold, depth + 1, null_type));
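For context, the added condition makes the MAP fallback respect a disabled map inference setting: an inconsistent object is only turned into a MAP when map_inference_threshold is set. A self-contained sketch of the guarded decision, where INVALID_INDEX stands in for DConstants::INVALID_INDEX (which the diff suggests acts as the "disabled" sentinel):

#include <cstdint>
#include <limits>

using idx_t = uint64_t;
// Stand-in for DConstants::INVALID_INDEX.
constexpr idx_t INVALID_INDEX = std::numeric_limits<idx_t>::max();

// Fall back to a MAP type for an inconsistent JSON object only when map
// inference is enabled; previously the sentinel was not checked here.
bool InferAsMap(idx_t map_inference_threshold, bool structure_is_inconsistent) {
	return map_inference_threshold != INVALID_INDEX && structure_is_inconsistent;
}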
@@ -119,15 +119,15 @@ void CSVBufferManager::ResetBuffer(const idx_t buffer_idx) {
}
}

idx_t CSVBufferManager::GetBufferSize() {
idx_t CSVBufferManager::GetBufferSize() const {
return buffer_size;
}

idx_t CSVBufferManager::BufferCount() {
idx_t CSVBufferManager::BufferCount() const {
return cached_buffers.size();
}

bool CSVBufferManager::Done() {
bool CSVBufferManager::Done() const {
return done;
}

@@ -144,7 +144,7 @@ void CSVBufferManager::ResetBufferManager() {
}
}

string CSVBufferManager::GetFilePath() {
string CSVBufferManager::GetFilePath() const {
return file_path;
}

@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/base_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"

namespace duckdb {
49 changes: 44 additions & 5 deletions src/execution/operator/csv_scanner/scanner/csv_schema.cpp
@@ -60,14 +60,53 @@ bool CSVSchema::Empty() const {
return columns.empty();
}

bool CSVSchema::SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path) {
D_ASSERT(names.size() == types.size());
bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const {
D_ASSERT(sniffer_result.names.size() == sniffer_result.return_types.size());
bool match = true;
unordered_map<string, TypeIdxPair> current_schema;
for (idx_t i = 0; i < names.size(); i++) {

for (idx_t i = 0; i < sniffer_result.names.size(); i++) {
// Populate our little schema
current_schema[names[i]] = {types[i], i};
current_schema[sniffer_result.names[i]] = {sniffer_result.return_types[i], i};
}
if (is_minimal_sniffer) {
auto min_sniffer = static_cast<AdaptiveSnifferResult &>(sniffer_result);
if (!min_sniffer.more_than_one_row) {
bool min_sniff_match = true;
// If we don't have more than one row, either the names must match or the types must match.
for (auto &column : columns) {
if (current_schema.find(column.name) == current_schema.end()) {
min_sniff_match = false;
break;
}
}
if (min_sniff_match) {
return true;
}
// Otherwise, the types must match.
min_sniff_match = true;
if (sniffer_result.return_types.size() == columns.size()) {
idx_t return_type_idx = 0;
for (auto &column : columns) {
if (column.type != sniffer_result.return_types[return_type_idx++]) {
min_sniff_match = false;
break;
}
}
} else {
min_sniff_match = false;
}
if (min_sniff_match) {
// If we got here, we have the right types but the wrong names; let's fix the names
idx_t sniff_name_idx = 0;
for (auto &column : columns) {
sniffer_result.names[sniff_name_idx++] = column.name;
}
return true;
}
}
// If we got to this point, the minimal sniffer doesn't match, so we report an error.
}
// Here we check if the schema of a given file matches our original schema
// We consider it not a match if:
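The single-row fallback added above is the interesting case: when the minimal sniffer saw at most one row, the file is accepted if either all known column names appear in the sniffed header, or the positional types match (in which case the sniffed names are overwritten with the schema's). A simplified standalone sketch of that logic, with LogicalType reduced to a string stand-in and error reporting omitted:

#include <cstddef>
#include <string>
#include <unordered_set>
#include <vector>

struct Column {
	std::string name;
	std::string type;
};

// Returns true if the sniffed result is compatible with the known schema;
// may rewrite sniffed_names, mirroring the name fix-up in the diff.
bool MinimalSniffMatches(const std::vector<Column> &schema,
                         std::vector<std::string> &sniffed_names,
                         const std::vector<std::string> &sniffed_types) {
	// Phase 1: every schema column name appears among the sniffed names.
	std::unordered_set<std::string> sniffed(sniffed_names.begin(), sniffed_names.end());
	bool names_match = true;
	for (auto &column : schema) {
		if (sniffed.find(column.name) == sniffed.end()) {
			names_match = false;
			break;
		}
	}
	if (names_match) {
		return true;
	}
	// Phase 2: the types match positionally; adopt the schema's names.
	if (sniffed_types.size() != schema.size()) {
		return false;
	}
	for (std::size_t i = 0; i < schema.size(); i++) {
		if (schema[i].type != sniffed_types[i]) {
			return false;
		}
	}
	for (std::size_t i = 0; i < schema.size(); i++) {
		sniffed_names[i] = schema[i].name;
	}
	return true;
}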
45 changes: 22 additions & 23 deletions src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/common/types/value.hpp"

namespace duckdb {
@@ -88,15 +88,14 @@ void CSVSniffer::SetResultOptions() {
options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header;
}

SnifferResult CSVSniffer::MinimalSniff() {
AdaptiveSnifferResult CSVSniffer::MinimalSniff() {
if (set_columns.IsSet()) {
// Nothing to see here
return SnifferResult(*set_columns.types, *set_columns.names);
return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true);
}
// Return Types detected
vector<LogicalType> return_types;
// Column Names detected
vector<string> names;

buffer_manager->sniffing = true;
constexpr idx_t result_size = 2;
@@ -106,7 +105,8 @@ SnifferResult CSVSniffer::MinimalSniff() {
ColumnCountScanner count_scanner(buffer_manager, state_machine, error_handler, result_size);
auto &sniffed_column_counts = count_scanner.ParseChunk();
if (sniffed_column_counts.result_position == 0) {
return {{}, {}};
// The file is empty, so we just return
return {{}, {}, false};
}

state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns;
@@ -130,20 +130,20 @@ SnifferResult CSVSniffer::MinimalSniff() {

// Possibly Gather Header
vector<HeaderValue> potential_header;
if (start_row != 0) {
for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);

for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);
}
names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

vector<string> names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
@@ -153,34 +153,33 @@ SnifferResult CSVSniffer::MinimalSniff() {
detected_types.push_back(d_type);
}

return {detected_types, names};
return {detected_types, names, sniffed_column_counts.result_position > 1};
}

SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) {
SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) {
auto min_sniff_res = MinimalSniff();
bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors();
// Check if we are happy with the result or if we need to do more sniffing
if (!error_handler->AnyErrors() && !detection_error_handler->AnyErrors()) {
// If we got no errors, we also run full if schemas do not match.
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
run_full =
!file_schema.SchemasMatch(error, min_sniff_res.names, min_sniff_res.return_types, options.file_path);
run_full = !file_schema.SchemasMatch(error, min_sniff_res, options.file_path, true);
}
}
if (run_full) {
// We run full sniffer
auto full_sniffer = SniffCSV();
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
if (!file_schema.SchemasMatch(error, full_sniffer.names, full_sniffer.return_types, options.file_path) &&
if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) &&
!options.ignore_errors.GetValue()) {
throw InvalidInputException(error);
}
}
return full_sniffer;
}
return min_sniff_res;
return min_sniff_res.ToSnifferResult();
}
SnifferResult CSVSniffer::SniffCSV(bool force_match) {
buffer_manager->sniffing = true;
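Taken together, AdaptiveSniff now runs the cheap two-row MinimalSniff first and only pays for the full sniffer when there were errors or the minimal result does not match the schema seen so far. A runnable stub of that control flow, with hypothetical stand-ins for the DuckDB classes:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-ins for DuckDB's sniffer result types.
struct SnifferResult {
	std::vector<std::string> names;
	std::vector<std::string> return_types;
};
struct AdaptiveSnifferResult : SnifferResult {
	bool more_than_one_row; // the flag this commit threads through
	SnifferResult ToSnifferResult() const {
		return {names, return_types};
	}
};

// Cheap two-row sample (stubbed).
AdaptiveSnifferResult MinimalSniff() {
	AdaptiveSnifferResult res;
	res.names = {"date_col", "int_col"};
	res.return_types = {"DATE", "INTEGER"};
	res.more_than_one_row = true;
	return res;
}
// Full-file sniff: expensive but authoritative (stubbed).
SnifferResult SniffCSV() {
	return {{"date_col", "int_col"}, {"DATE", "INTEGER"}};
}
// Stand-in for CSVSchema::SchemasMatch.
bool SchemasMatch(const SnifferResult &res) {
	return !res.names.empty();
}

SnifferResult AdaptiveSniff() {
	auto min_res = MinimalSniff();
	// The real code also forces a full run on any sniffing errors.
	bool run_full = !SchemasMatch(min_res);
	if (run_full) {
		return SniffCSV();
	}
	return min_res.ToSnifferResult();
}

int main() {
	auto result = AdaptiveSniff();
	std::cout << "sniffed " << result.names.size() << " columns\n";
	return 0;
}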
@@ -1,5 +1,5 @@
#include "duckdb/common/shared_ptr.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/main/client_data.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

@@ -1,5 +1,5 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

#include "utf8proc.hpp"
@@ -4,7 +4,7 @@
#include "duckdb/common/operator/integer_cast_operator.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
struct TryCastFloatingOperator {
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"

namespace duckdb {
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
void CSVSniffer::ReplaceTypes() {
@@ -1,5 +1,5 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/main/error_manager.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
