Commit 5e4f2e1: merge with feature

lnkuiper committed Oct 15, 2024
2 parents: 9e782f1 + b706529
Showing 40 changed files with 353 additions and 100 deletions.
2 changes: 1 addition & 1 deletion .github/config/distribution_matrix.json
@@ -39,7 +39,7 @@
"vcpkg_triplet": "x64-windows-static-md"
},
{
"duckdb_arch": "windows_amd64_rtools",
"duckdb_arch": "windows_amd64_mingw",
"vcpkg_triplet": "x64-mingw-static"
}
]
12 changes: 6 additions & 6 deletions .github/config/out_of_tree_extensions.cmake
@@ -39,7 +39,7 @@ if (NOT MINGW)
duckdb_extension_load(azure
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_azure
GIT_TAG b0ffe7ada20cdbd0bee2bbe5461ecd22fb468062
GIT_TAG a40ecb7bc9036eb8ecc5bf30db935a31b78011f5
APPLY_PATCHES
)
endif()
@@ -51,7 +51,7 @@ if (NOT MINGW AND NOT "${OS_NAME}" STREQUAL "linux")
duckdb_extension_load(delta
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_delta
GIT_TAG 3933ebd800ad06a64656c9aef6ca7d62897fa4db
GIT_TAG 811db25f5bd405dea186d6c461a642a387502ad8
APPLY_PATCHES
)
endif()
@@ -76,7 +76,7 @@ if (NOT MINGW)
duckdb_extension_load(iceberg
${LOAD_ICEBERG_TESTS}
GIT_URL https://github.com/duckdb/duckdb_iceberg
GIT_TAG 3f6d753787252e3da1d12157910b62edf729fc6e
GIT_TAG 8b48d1261564613274ac8e9fae01e572d965c99d
APPLY_PATCHES
)
endif()
@@ -106,7 +106,7 @@ endif()
duckdb_extension_load(spatial
DONT_LINK LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_spatial.git
GIT_TAG bb9c829693965f029eb5a312aefed4c538fad781
GIT_TAG 3f94d52aa9f7d67b1a30e6cea642bbb790c04aa2
INCLUDE_DIR spatial/include
TEST_DIR test/sql
APPLY_PATCHES
@@ -123,7 +123,7 @@ endif()
duckdb_extension_load(sqlite_scanner
${STATIC_LINK_SQLITE} LOAD_TESTS
GIT_URL https://github.com/duckdb/sqlite_scanner
GIT_TAG 315861963c8106397af36cbda10faebc8dae485a
GIT_TAG d5d62657702d33cb44a46cddc7ffc4b67bf7e961
APPLY_PATCHES
)

@@ -149,7 +149,7 @@ duckdb_extension_load(vss
LOAD_TESTS
DONT_LINK
GIT_URL https://github.com/duckdb/duckdb_vss
GIT_TAG 77739ea5382cce3220af83803ac0b1e98b3ab7d8
GIT_TAG dd880d6121c0f3dff27131e54e057c9db0f1c710
TEST_DIR test/sql
APPLY_PATCHES
)
4 changes: 2 additions & 2 deletions .github/workflows/R.yml
@@ -62,8 +62,8 @@ jobs:

- uses: ./.github/actions/build_extensions
with:
deploy_as: windows_amd64_rtools
duckdb_arch: windows_amd64_rtools
deploy_as: windows_amd64_mingw
duckdb_arch: windows_amd64_mingw
vcpkg_target_triplet: x64-mingw-static
treat_warn_as_error: 0
s3_id: ${{ secrets.S3_ID }}
3 changes: 3 additions & 0 deletions data/csv/glob_dif_dialect/14166/__2000.csv
@@ -0,0 +1,3 @@
date_col,int_col,double_col
2000-01-01,10,80.9189441112103
2000-01-02,5,109.16581782022259
1 change: 1 addition & 0 deletions data/csv/glob_dif_dialect/14166/__2001.csv
@@ -0,0 +1 @@
date_col,int_col,double_col
Empty file.
1 change: 1 addition & 0 deletions data/csv/glob_dif_dialect/14166/matching_types.csv
@@ -0,0 +1 @@
2003-01-02,5,109.16581782022259
18 changes: 12 additions & 6 deletions extension/core_functions/scalar/date/date_diff.cpp
@@ -28,6 +28,14 @@ struct DateDiff {
});
}

// We need to truncate towards negative infinity (floor division), not towards 0
static inline int64_t Truncate(int64_t value, int64_t units) {
return (value + (value < 0)) / units - (value < 0);
}
static inline int64_t Diff(int64_t start, int64_t end, int64_t units) {
return Truncate(end, units) - Truncate(start, units);
}

struct YearOperator {
template <class TA, class TB, class TR>
static inline TR Operation(TA startdate, TB enddate) {
@@ -204,30 +204,28 @@ template <>
int64_t DateDiff::MillisecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochMs(enddate) - Timestamp::GetEpochMs(startdate);
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MSEC);
}

template <>
int64_t DateDiff::SecondsOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) - Timestamp::GetEpochSeconds(startdate);
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_SEC);
}

template <>
int64_t DateDiff::MinutesOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_MINUTE -
Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_MINUTE;
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_MINUTE);
}

template <>
int64_t DateDiff::HoursOperator::Operation(timestamp_t startdate, timestamp_t enddate) {
D_ASSERT(Timestamp::IsFinite(startdate));
D_ASSERT(Timestamp::IsFinite(enddate));
return Timestamp::GetEpochSeconds(enddate) / Interval::SECS_PER_HOUR -
Timestamp::GetEpochSeconds(startdate) / Interval::SECS_PER_HOUR;
return Diff(startdate.value, enddate.value, Interval::MICROS_PER_HOUR);
}

// TIME specialisations
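The switch from epoch-based subtraction to Diff above matters for timestamps before the epoch: C++ integer division truncates towards zero, so negative microsecond counts would land in the wrong bucket and boundary crossings could be missed. A minimal standalone sketch of the technique, assuming positive unit sizes as with the Interval constants used in the diff:

#include <cassert>
#include <cstdint>

// Floor division: round towards negative infinity, not towards zero.
// Assumes units > 0.
static int64_t Truncate(int64_t value, int64_t units) {
	return (value + (value < 0)) / units - (value < 0);
}

// Number of unit boundaries crossed between start and end.
static int64_t Diff(int64_t start, int64_t end, int64_t units) {
	return Truncate(end, units) - Truncate(start, units);
}

int main() {
	constexpr int64_t MICROS_PER_SEC = 1000000;
	// -1 us lies in second -1; truncation towards zero would say second 0.
	assert(-1 / MICROS_PER_SEC == 0);
	assert(Truncate(-1, MICROS_PER_SEC) == -1);
	// One microsecond before the epoch to one after it crosses one boundary...
	assert(Diff(-1, 1, MICROS_PER_SEC) == 1);
	// ...which a subtraction of truncated-towards-zero values would miss.
	assert(1 / MICROS_PER_SEC - (-1) / MICROS_PER_SEC == 0);
	return 0;
}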
3 changes: 2 additions & 1 deletion extension/json/json_functions/json_structure.cpp
@@ -716,7 +716,8 @@ static LogicalType StructureToTypeObject(ClientContext &context, const JSONStruc
}

// If it's an inconsistent object we also just do MAP with the best-possible, recursively-merged value type
if (IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
if (map_inference_threshold != DConstants::INVALID_INDEX &&
IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
return LogicalType::MAP(LogicalType::VARCHAR,
GetMergedType(context, node, max_depth, field_appearance_threshold,
map_inference_threshold, depth + 1, null_type));
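For context, the added condition makes the MAP fallback respect a disabled map inference setting: an inconsistent object is only turned into a MAP when map_inference_threshold is set. A self-contained sketch of the guarded decision, where INVALID_INDEX stands in for DConstants::INVALID_INDEX (which the diff suggests acts as the "disabled" sentinel):

#include <cstdint>
#include <limits>

using idx_t = uint64_t;
// Stand-in for DConstants::INVALID_INDEX.
constexpr idx_t INVALID_INDEX = std::numeric_limits<idx_t>::max();

// Fall back to a MAP type for an inconsistent JSON object only when map
// inference is enabled; previously the sentinel was not checked here.
bool InferAsMap(idx_t map_inference_threshold, bool structure_is_inconsistent) {
	return map_inference_threshold != INVALID_INDEX && structure_is_inconsistent;
}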
@@ -119,15 +119,15 @@ void CSVBufferManager::ResetBuffer(const idx_t buffer_idx) {
}
}

idx_t CSVBufferManager::GetBufferSize() {
idx_t CSVBufferManager::GetBufferSize() const {
return buffer_size;
}

idx_t CSVBufferManager::BufferCount() {
idx_t CSVBufferManager::BufferCount() const {
return cached_buffers.size();
}

bool CSVBufferManager::Done() {
bool CSVBufferManager::Done() const {
return done;
}

@@ -144,7 +144,7 @@ void CSVBufferManager::ResetBufferManager() {
}
}

string CSVBufferManager::GetFilePath() {
string CSVBufferManager::GetFilePath() const {
return file_path;
}

@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/base_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"

namespace duckdb {
49 changes: 44 additions & 5 deletions src/execution/operator/csv_scanner/scanner/csv_schema.cpp
@@ -60,14 +60,53 @@ bool CSVSchema::Empty() const {
return columns.empty();
}

bool CSVSchema::SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path) {
D_ASSERT(names.size() == types.size());
bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const {
D_ASSERT(sniffer_result.names.size() == sniffer_result.return_types.size());
bool match = true;
unordered_map<string, TypeIdxPair> current_schema;
for (idx_t i = 0; i < names.size(); i++) {

for (idx_t i = 0; i < sniffer_result.names.size(); i++) {
// Populate our little schema
current_schema[names[i]] = {types[i], i};
current_schema[sniffer_result.names[i]] = {sniffer_result.return_types[i], i};
}
if (is_minimal_sniffer) {
auto min_sniffer = static_cast<AdaptiveSnifferResult &>(sniffer_result);
if (!min_sniffer.more_than_one_row) {
bool min_sniff_match = true;
// If we don't have more than one row, either the names must match or the types must match.
for (auto &column : columns) {
if (current_schema.find(column.name) == current_schema.end()) {
min_sniff_match = false;
break;
}
}
if (min_sniff_match) {
return true;
}
// Otherwise, the types must match.
min_sniff_match = true;
if (sniffer_result.return_types.size() == columns.size()) {
idx_t return_type_idx = 0;
for (auto &column : columns) {
if (column.type != sniffer_result.return_types[return_type_idx++]) {
min_sniff_match = false;
break;
}
}
} else {
min_sniff_match = false;
}
if (min_sniff_match) {
// If we got here, we have the right types but the wrong names; let's fix the names
idx_t sniff_name_idx = 0;
for (auto &column : columns) {
sniffer_result.names[sniff_name_idx++] = column.name;
}
return true;
}
}
// If we got to this point, the minimal sniffer doesn't match, so we report an error.
}
// Here we check if the schema of a given file matches our original schema
// We consider it not a match if:
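The single-row fallback added above is the interesting case: when the minimal sniffer saw at most one row, the file is accepted if either all known column names appear in the sniffed header, or the positional types match (in which case the sniffed names are overwritten with the schema's). A simplified standalone sketch of that logic, with LogicalType reduced to a string stand-in and error reporting omitted:

#include <cstddef>
#include <string>
#include <unordered_set>
#include <vector>

struct Column {
	std::string name;
	std::string type;
};

// Returns true if the sniffed result is compatible with the known schema;
// may rewrite sniffed_names, mirroring the name fix-up in the diff.
bool MinimalSniffMatches(const std::vector<Column> &schema,
                         std::vector<std::string> &sniffed_names,
                         const std::vector<std::string> &sniffed_types) {
	// Phase 1: every schema column name appears among the sniffed names.
	std::unordered_set<std::string> sniffed(sniffed_names.begin(), sniffed_names.end());
	bool names_match = true;
	for (auto &column : schema) {
		if (sniffed.find(column.name) == sniffed.end()) {
			names_match = false;
			break;
		}
	}
	if (names_match) {
		return true;
	}
	// Phase 2: the types match positionally; adopt the schema's names.
	if (sniffed_types.size() != schema.size()) {
		return false;
	}
	for (std::size_t i = 0; i < schema.size(); i++) {
		if (schema[i].type != sniffed_types[i]) {
			return false;
		}
	}
	for (std::size_t i = 0; i < schema.size(); i++) {
		sniffed_names[i] = schema[i].name;
	}
	return true;
}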
45 changes: 22 additions & 23 deletions src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/common/types/value.hpp"

namespace duckdb {
@@ -88,15 +88,14 @@ void CSVSniffer::SetResultOptions() {
options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header;
}

SnifferResult CSVSniffer::MinimalSniff() {
AdaptiveSnifferResult CSVSniffer::MinimalSniff() {
if (set_columns.IsSet()) {
// Nothing to see here
return SnifferResult(*set_columns.types, *set_columns.names);
return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true);
}
// Return Types detected
vector<LogicalType> return_types;
// Column Names detected
vector<string> names;

buffer_manager->sniffing = true;
constexpr idx_t result_size = 2;
@@ -106,7 +105,8 @@ SnifferResult CSVSniffer::MinimalSniff() {
ColumnCountScanner count_scanner(buffer_manager, state_machine, error_handler, result_size);
auto &sniffed_column_counts = count_scanner.ParseChunk();
if (sniffed_column_counts.result_position == 0) {
return {{}, {}};
// The file is empty, so we just return
return {{}, {}, false};
}

state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns;
@@ -130,20 +130,20 @@ SnifferResult CSVSniffer::MinimalSniff() {

// Possibly Gather Header
vector<HeaderValue> potential_header;
if (start_row != 0) {
for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);

for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);
}
names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

vector<string> names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
@@ -153,34 +153,33 @@ SnifferResult CSVSniffer::MinimalSniff() {
detected_types.push_back(d_type);
}

return {detected_types, names};
return {detected_types, names, sniffed_column_counts.result_position > 1};
}

SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) {
SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) {
auto min_sniff_res = MinimalSniff();
bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors();
// Check if we are happy with the result or if we need to do more sniffing
if (!error_handler->AnyErrors() && !detection_error_handler->AnyErrors()) {
// If we got no errors, we also run full if schemas do not match.
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
run_full =
!file_schema.SchemasMatch(error, min_sniff_res.names, min_sniff_res.return_types, options.file_path);
run_full = !file_schema.SchemasMatch(error, min_sniff_res, options.file_path, true);
}
}
if (run_full) {
// We run full sniffer
auto full_sniffer = SniffCSV();
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
if (!file_schema.SchemasMatch(error, full_sniffer.names, full_sniffer.return_types, options.file_path) &&
if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) &&
!options.ignore_errors.GetValue()) {
throw InvalidInputException(error);
}
}
return full_sniffer;
}
return min_sniff_res;
return min_sniff_res.ToSnifferResult();
}
SnifferResult CSVSniffer::SniffCSV(bool force_match) {
buffer_manager->sniffing = true;
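Taken together, AdaptiveSniff now runs the cheap two-row MinimalSniff first and only pays for the full sniffer when there were errors or the minimal result does not match the schema seen so far. A runnable stub of that control flow, with hypothetical stand-ins for the DuckDB classes:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-ins for DuckDB's sniffer result types.
struct SnifferResult {
	std::vector<std::string> names;
	std::vector<std::string> return_types;
};
struct AdaptiveSnifferResult : SnifferResult {
	bool more_than_one_row; // the flag this commit threads through
	SnifferResult ToSnifferResult() const {
		return {names, return_types};
	}
};

// Cheap two-row sample (stubbed).
AdaptiveSnifferResult MinimalSniff() {
	AdaptiveSnifferResult res;
	res.names = {"date_col", "int_col"};
	res.return_types = {"DATE", "INTEGER"};
	res.more_than_one_row = true;
	return res;
}
// Full-file sniff: expensive but authoritative (stubbed).
SnifferResult SniffCSV() {
	return {{"date_col", "int_col"}, {"DATE", "INTEGER"}};
}
// Stand-in for CSVSchema::SchemasMatch.
bool SchemasMatch(const SnifferResult &res) {
	return !res.names.empty();
}

SnifferResult AdaptiveSniff() {
	auto min_res = MinimalSniff();
	// The real code also forces a full run on any sniffing errors.
	bool run_full = !SchemasMatch(min_res);
	if (run_full) {
		return SniffCSV();
	}
	return min_res.ToSnifferResult();
}

int main() {
	auto result = AdaptiveSniff();
	std::cout << "sniffed " << result.names.size() << " columns\n";
	return 0;
}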
@@ -1,5 +1,5 @@
#include "duckdb/common/shared_ptr.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/main/client_data.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

@@ -1,5 +1,5 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

#include "utf8proc.hpp"
@@ -4,7 +4,7 @@
#include "duckdb/common/operator/integer_cast_operator.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
struct TryCastFloatingOperator {
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"

namespace duckdb {
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
void CSVSniffer::ReplaceTypes() {
@@ -1,5 +1,5 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/main/error_manager.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
