Skip to content

Commit

Permalink
Merge branch 'main' into fivetran_regression
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Nov 13, 2024
2 parents 1f22e76 + ca5af32 commit f51e463
Show file tree
Hide file tree
Showing 211 changed files with 5,492 additions and 810 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@
diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp
index 23482f1..968f116 100644
index 65eb34f..9b45db2 100644
--- a/src/functions/delta_scan.cpp
+++ b/src/functions/delta_scan.cpp
@@ -599,12 +599,12 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio
@@ -464,7 +464,11 @@ unique_ptr<MultiFileList> DeltaSnapshot::ComplexFilterPushdown(ClientContext &co
for (const auto &filter : filters) {
combiner.AddFilter(filter->Copy());
}
- auto filterstmp = combiner.GenerateTableScanFilters(info.column_ids);
+ vector<ColumnIndex> column_indexes;
+ for(auto column_id : info.column_ids) {
+ column_indexes.emplace_back(column_id);
+ }
+ auto filterstmp = combiner.GenerateTableScanFilters(column_indexes);

// TODO: can/should we figure out if this filtered anything?
auto filtered_list = make_uniq<DeltaSnapshot>(context, paths[0]);
@@ -529,7 +533,7 @@ unique_ptr<NodeStatistics> DeltaSnapshot::GetCardinality(ClientContext &context)
return nullptr;
}

-unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance() {
+unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance(const TableFunction &table_function) {
return std::move(make_uniq<DeltaMultiFileReader>());
}

@@ -618,12 +622,12 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio
}
}

Expand All @@ -18,12 +40,15 @@ index 23482f1..968f116 100644

// Generate the correct Selection Vector Based on the Raw delta KernelBoolSlice dv and the row_id_column
diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp
index 23c937d..84220f9 100644
index aac35cc..84220f9 100644
--- a/src/include/functions/delta_scan.hpp
+++ b/src/include/functions/delta_scan.hpp
@@ -105,7 +105,7 @@ struct DeltaMultiFileReaderGlobalState : public MultiFileReaderGlobalState {
@@ -103,9 +103,9 @@ struct DeltaMultiFileReaderGlobalState : public MultiFileReaderGlobalState {
};

struct DeltaMultiFileReader : public MultiFileReader {
static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table_function);
- static unique_ptr<MultiFileReader> CreateInstance();
+ static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table_function);
//! Return a DeltaSnapshot
- unique_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
+ shared_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
Expand Down

This file was deleted.

48 changes: 44 additions & 4 deletions .github/patches/extensions/spatial/random_test_fix.patch
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,51 @@ index 007d386..8754619 100644

state.current_idx++;
}
diff --git a/spatial/src/spatial/core/index/rtree/rtree_index_plan_scan.cpp b/spatial/src/spatial/core/index/rtree/rtree_index_plan_scan.cpp
index b420233..f904fdd 100644
--- a/spatial/src/spatial/core/index/rtree/rtree_index_plan_scan.cpp
+++ b/spatial/src/spatial/core/index/rtree/rtree_index_plan_scan.cpp
@@ -46,7 +46,7 @@ public:
column_t referenced_column = column_ids[bound_colref.binding.column_index];
// search for the referenced column in the set of column_ids
for (idx_t i = 0; i < get_column_ids.size(); i++) {
- if (get_column_ids[i] == referenced_column) {
+ if (get_column_ids[i].GetPrimaryIndex() == referenced_column) {
bound_colref.binding.column_index = i;
return;
}
@@ -213,7 +213,7 @@ public:
auto &type = get.returned_types[column_id];
bool found = false;
for (idx_t i = 0; i < column_ids.size(); i++) {
- if (column_ids[i] == column_id) {
+ if (column_ids[i].GetPrimaryIndex() == column_id) {
column_id = i;
found = true;
break;
diff --git a/spatial/src/spatial/core/index/rtree/rtree_index_scan.cpp b/spatial/src/spatial/core/index/rtree/rtree_index_scan.cpp
index 9168790..7fd53a2 100644
index 01f2966..0fabe44 100644
--- a/spatial/src/spatial/core/index/rtree/rtree_index_scan.cpp
+++ b/spatial/src/spatial/core/index/rtree/rtree_index_scan.cpp
@@ -208,7 +208,6 @@ TableFunction RTreeIndexScanFunction::GetFunction() {
@@ -31,7 +31,7 @@ BindInfo RTreeIndexScanBindInfo(const optional_ptr<FunctionData> bind_data_p) {
struct RTreeIndexScanGlobalState : public GlobalTableFunctionState {
ColumnFetchState fetch_state;
TableScanState local_storage_state;
- vector<storage_t> column_ids;
+ vector<StorageIndex> column_ids;

// Index scan state
unique_ptr<IndexScanState> index_state;
@@ -54,7 +54,7 @@ static unique_ptr<GlobalTableFunctionState> RTreeIndexScanInitGlobal(ClientConte
if (id != DConstants::INVALID_INDEX) {
col_id = bind_data.table.GetColumn(LogicalIndex(id)).StorageOid();
}
- result->column_ids.push_back(col_id);
+ result->column_ids.emplace_back(col_id);
}

// Initialize the storage scan state
@@ -205,7 +205,6 @@ TableFunction RTreeIndexScanFunction::GetFunction() {
func.pushdown_complex_filter = nullptr;
func.to_string = RTreeIndexScanToString;
func.table_scan_progress = nullptr;
Expand Down Expand Up @@ -56,10 +96,10 @@ index 465cb87..5aa49dd 100644

ExtensionUtil::RegisterFunction(db, read);
diff --git a/spatial/src/spatial/gdal/functions/st_read.cpp b/spatial/src/spatial/gdal/functions/st_read.cpp
index b730baa..8d08898 100644
index 9bd92e8..600fad8 100644
--- a/spatial/src/spatial/gdal/functions/st_read.cpp
+++ b/spatial/src/spatial/gdal/functions/st_read.cpp
@@ -676,7 +676,7 @@ void GdalTableFunction::Register(DatabaseInstance &db) {
@@ -675,7 +675,7 @@ void GdalTableFunction::Register(DatabaseInstance &db) {
GdalTableFunction::InitGlobal, GdalTableFunction::InitLocal);

scan.cardinality = GdalTableFunction::Cardinality;
Expand Down
11 changes: 10 additions & 1 deletion .github/patches/extensions/substrait/or_filter_pushdown.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/src/to_substrait.cpp b/src/to_substrait.cpp
index a466fb2..44a9923 100644
index a466fb2..0dd0766 100644
--- a/src/to_substrait.cpp
+++ b/src/to_substrait.cpp
@@ -1296,6 +1296,17 @@ substrait::Rel *DuckDBToSubstrait::TransformGet(LogicalOperator &dop) {
Expand All @@ -20,3 +20,12 @@ index a466fb2..44a9923 100644
if (!dget.table_filters.filters.empty()) {
// Pushdown filter
auto filter = CreateConjunction(dget.table_filters.filters,
@@ -1317,7 +1328,7 @@ substrait::Rel *DuckDBToSubstrait::TransformGet(LogicalOperator &dop) {
auto &column_ids = dget.GetColumnIds();
for (auto col_idx : dget.projection_ids) {
auto struct_item = select->add_struct_items();
- struct_item->set_field(static_cast<int32_t>(column_ids[col_idx]));
+ struct_item->set_field(static_cast<int32_t>(column_ids[col_idx].GetPrimaryIndex()));
// FIXME do we need to set the child? if yes, to what?
}
projection->set_allocated_select(select);
135 changes: 134 additions & 1 deletion .github/patches/extensions/vss/partitioning.patch
Original file line number Diff line number Diff line change
@@ -1,7 +1,47 @@
diff --git a/src/hnsw/hnsw_index.cpp b/src/hnsw/hnsw_index.cpp
index b8bfd1f..50cb165 100644
--- a/src/hnsw/hnsw_index.cpp
+++ b/src/hnsw/hnsw_index.cpp
@@ -581,7 +581,7 @@ void HNSWIndex::VerifyAllocations(IndexLock &state) {
// Can rewrite index expression?
//------------------------------------------------------------------------------
static void TryBindIndexExpressionInternal(Expression &expr, idx_t table_idx, const vector<column_t> &index_columns,
- const vector<column_t> &table_columns, bool &success, bool &found) {
+ const vector<ColumnIndex> &table_columns, bool &success, bool &found) {

if (expr.type == ExpressionType::BOUND_COLUMN_REF) {
found = true;
@@ -592,7 +592,7 @@ static void TryBindIndexExpressionInternal(Expression &expr, idx_t table_idx, co

const auto referenced_column = index_columns[ref.binding.column_index];
for (idx_t i = 0; i < table_columns.size(); i++) {
- if (table_columns[i] == referenced_column) {
+ if (table_columns[i].GetPrimaryIndex() == referenced_column) {
ref.binding.column_index = i;
return;
}
diff --git a/src/hnsw/hnsw_index_scan.cpp b/src/hnsw/hnsw_index_scan.cpp
index bd4826c..746e22d 100644
index bd4826c..16f4953 100644
--- a/src/hnsw/hnsw_index_scan.cpp
+++ b/src/hnsw/hnsw_index_scan.cpp
@@ -29,7 +29,7 @@ BindInfo HNSWIndexScanBindInfo(const optional_ptr<FunctionData> bind_data_p) {
struct HNSWIndexScanGlobalState : public GlobalTableFunctionState {
ColumnFetchState fetch_state;
TableScanState local_storage_state;
- vector<storage_t> column_ids;
+ vector<StorageIndex> column_ids;

// Index scan state
unique_ptr<IndexScanState> index_state;
@@ -52,7 +52,7 @@ static unique_ptr<GlobalTableFunctionState> HNSWIndexScanInitGlobal(ClientContex
if (id != DConstants::INVALID_INDEX) {
col_id = bind_data.table.GetColumn(LogicalIndex(id)).StorageOid();
}
- result->column_ids.push_back(col_id);
+ result->column_ids.emplace_back(col_id);
}

// Initialize the storage scan state
@@ -141,7 +141,6 @@ TableFunction HNSWIndexScanFunction::GetFunction() {
func.pushdown_complex_filter = nullptr;
func.to_string = HNSWIndexScanToString;
Expand All @@ -10,3 +50,96 @@ index bd4826c..746e22d 100644
func.projection_pushdown = true;
func.filter_pushdown = false;
func.get_bind_info = HNSWIndexScanBindInfo;
diff --git a/src/hnsw/hnsw_optimize_join.cpp b/src/hnsw/hnsw_optimize_join.cpp
index fb79fdf..9201a3b 100644
--- a/src/hnsw/hnsw_optimize_join.cpp
+++ b/src/hnsw/hnsw_optimize_join.cpp
@@ -19,6 +19,7 @@
#include "duckdb/planner/expression_iterator.hpp"
#include "duckdb/storage/table/scan_state.hpp"
#include "duckdb/transaction/duck_transaction.hpp"
+#include "duckdb/storage/storage_index.hpp"

#include "hnsw/hnsw.hpp"
#include "hnsw/hnsw_index.hpp"
@@ -74,7 +75,7 @@ public:

ColumnFetchState fetch_state;
TableScanState local_storage_state;
- vector<storage_t> phyiscal_column_ids;
+ vector<StorageIndex> physical_column_ids;

// Index scan state
unique_ptr<IndexScanState> index_state;
@@ -85,7 +86,7 @@ unique_ptr<OperatorState> PhysicalHNSWIndexJoin::GetOperatorState(ExecutionConte
auto result = make_uniq<HNSWIndexJoinState>();

auto &local_storage = LocalStorage::Get(context.client, table.catalog);
- result->phyiscal_column_ids.reserve(inner_column_ids.size());
+ result->physical_column_ids.reserve(inner_column_ids.size());

// Figure out the storage column ids from the projection expression
for (auto &id : inner_column_ids) {
@@ -93,14 +94,14 @@ unique_ptr<OperatorState> PhysicalHNSWIndexJoin::GetOperatorState(ExecutionConte
if (id != DConstants::INVALID_INDEX) {
col_id = table.GetColumn(LogicalIndex(id)).StorageOid();
}
- result->phyiscal_column_ids.push_back(col_id);
+ result->physical_column_ids.emplace_back(col_id);
}

// Initialize selection vector
result->match_sel.Initialize();

// Initialize the storage scan state
- result->local_storage_state.Initialize(result->phyiscal_column_ids, nullptr);
+ result->local_storage_state.Initialize(result->physical_column_ids, nullptr);
local_storage.InitializeScan(table.GetStorage(), result->local_storage_state.local_state, nullptr);

// Initialize the index scan state
@@ -152,7 +153,7 @@ OperatorResultType PhysicalHNSWIndexJoin::Execute(ExecutionContext &context, Dat
const auto &row_ids = hnsw_index.GetMultiScanResult(*state.index_state);

// Execute one big fetch for the LHS
- table.GetStorage().Fetch(transcation, chunk, state.phyiscal_column_ids, row_ids, output_idx, state.fetch_state);
+ table.GetStorage().Fetch(transcation, chunk, state.physical_column_ids, row_ids, output_idx, state.fetch_state);

// Now slice the chunk so that we include the rhs too
chunk.Slice(input, state.match_sel, output_idx, OUTER_COLUMN_OFFSET);
@@ -573,7 +574,9 @@ bool HNSWIndexJoinOptimizer::TryOptimize(Binder &binder, ClientContext &context,
//------------------------------------------------------------------------------

auto index_join = make_uniq<LogicalHNSWIndexJoin>(binder.GenerateTableIndex(), duck_table, *index_ptr, k_value);
- index_join->inner_column_ids = inner_get.GetColumnIds();
+ for(auto &column_id : inner_get.GetColumnIds()) {
+ index_join->inner_column_ids.emplace_back(column_id.GetPrimaryIndex());
+ }
index_join->inner_projection_ids = inner_get.projection_ids;
index_join->inner_returned_types = inner_get.returned_types;

diff --git a/src/hnsw/hnsw_optimize_scan.cpp b/src/hnsw/hnsw_optimize_scan.cpp
index 28cee3a..d5aded4 100644
--- a/src/hnsw/hnsw_optimize_scan.cpp
+++ b/src/hnsw/hnsw_optimize_scan.cpp
@@ -170,7 +170,7 @@ public:
auto &type = get.returned_types[column_id];
bool found = false;
for (idx_t i = 0; i < column_ids.size(); i++) {
- if (column_ids[i] == column_id) {
+ if (column_ids[i].GetPrimaryIndex() == column_id) {
column_id = i;
found = true;
break;
diff --git a/src/hnsw/hnsw_optimize_topk.cpp b/src/hnsw/hnsw_optimize_topk.cpp
index 6f78cea..14967d3 100644
--- a/src/hnsw/hnsw_optimize_topk.cpp
+++ b/src/hnsw/hnsw_optimize_topk.cpp
@@ -198,7 +198,7 @@ public:
auto &type = get.returned_types[column_id];
bool found = false;
for (idx_t i = 0; i < column_ids.size(); i++) {
- if (column_ids[i] == column_id) {
+ if (column_ids[i].GetPrimaryIndex() == column_id) {
column_id = i;
found = true;
break;
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ generate-files:
python3 scripts/generate_settings.py
python3 scripts/generate_serialization.py
python3 scripts/generate_enum_util.py
python3 scripts/generate_metric_enums.py
-@python3 tools/pythonpkg/scripts/generate_connection_code.py || echo "Warning: generate_connection_code.py failed, cxxheaderparser & pcpp are required to perform this step"
# Run the formatter again after (re)generating the files
$(MAKE) format-main
Expand Down
18 changes: 18 additions & 0 deletions benchmark/tpch/struct/tpch_q1_struct.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# name: benchmark/tpch/struct/tpch_q1_struct.benchmark
# description: Run Q01 over lineitem stored in structs
# group: [struct]

name Q01 Structs
group tpch
subgroup sf1

require tpch

load
CALL dbgen(sf=1, suffix='_normalized');
CREATE TABLE lineitem_struct AS SELECT lineitem_normalized AS struct_val FROM lineitem_normalized;
CREATE VIEW lineitem AS SELECT UNNEST(struct_val) FROM lineitem_struct;

run extension/tpch/dbgen/queries/q01.sql

result extension/tpch/dbgen/answers/sf1/q01.csv
18 changes: 18 additions & 0 deletions benchmark/tpch/struct/tpch_q1_struct_nested.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# name: benchmark/tpch/struct/tpch_q1_struct_nested.benchmark
# description: Run Q01 over lineitem stored in nested structs
# group: [struct]

name Q01 Structs
group tpch
subgroup sf1

require tpch

load
CALL dbgen(sf=1, suffix='_normalized');
CREATE TABLE lineitem_struct AS SELECT {'id': rowid, 'values': lineitem_normalized} AS struct_val FROM lineitem_normalized;
CREATE VIEW lineitem AS SELECT UNNEST(struct_val, recursive := true) FROM lineitem_struct;

run extension/tpch/dbgen/queries/q01.sql

result extension/tpch/dbgen/answers/sf1/q01.csv
Binary file added data/csv/null_terminator.csv
Binary file not shown.
2 changes: 2 additions & 0 deletions extension/parquet/parquet_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_p
if (duckdb_type.IsJSONType()) {
schema_ele.converted_type = ConvertedType::JSON;
schema_ele.__isset.converted_type = true;
schema_ele.__isset.logicalType = true;
schema_ele.logicalType.__set_JSON(duckdb_parquet::JsonType());
return;
}
switch (duckdb_type.id()) {
Expand Down
3 changes: 3 additions & 0 deletions scripts/generate_metric_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
"OPERATOR_ROWS_SCANNED",
"OPERATOR_TIMING",
"RESULT_SET_SIZE",
"LATENCY",
"ROWS_RETURNED",
"OPERATOR_NAME",
]

phase_timing_metrics = [
Expand Down
6 changes: 3 additions & 3 deletions src/catalog/catalog_entry/duck_table_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -671,12 +671,12 @@ unique_ptr<CatalogEntry> DuckTableEntry::ChangeColumnType(ClientContext &context

auto bound_create_info = binder->BindCreateTableInfo(std::move(create_info), schema);

vector<column_t> storage_oids;
vector<StorageIndex> storage_oids;
for (idx_t i = 0; i < bound_columns.size(); i++) {
storage_oids.push_back(columns.LogicalToPhysical(bound_columns[i]).index);
storage_oids.emplace_back(columns.LogicalToPhysical(bound_columns[i]).index);
}
if (storage_oids.empty()) {
storage_oids.push_back(COLUMN_IDENTIFIER_ROW_ID);
storage_oids.emplace_back(COLUMN_IDENTIFIER_ROW_ID);
}

auto new_storage =
Expand Down
Loading

0 comments on commit f51e463

Please sign in to comment.