merge with feature and set proper lowest zstd level
lnkuiper committed Oct 18, 2024
2 parents ffefbe0 + 1979504 commit 36885cc
Showing 204 changed files with 36,459 additions and 12,231 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/CodeQuality.yml
@@ -117,14 +117,14 @@ jobs:
run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build clang-tidy && sudo pip3 install pybind11[global] --break-system-packages

- name: Setup Ccache
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
uses: hendrikmuhs/ccache-action@main
with:
key: ${{ github.job }}
save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }}

- name: Download clang-tidy-cache
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
shell: bash
run: |
set -e
@@ -134,10 +134,10 @@ jobs:
- name: Tidy Check
shell: bash
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
run: make tidy-check TIDY_BINARY=/tmp/clang-tidy-cache

- name: Tidy Check Diff
shell: bash
-if: ${{ github.ref != 'refs/heads/main' }}
+if: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/feature' }}
run: make tidy-check-diff
40 changes: 0 additions & 40 deletions .github/workflows/Regression.yml
@@ -222,46 +222,6 @@ jobs:
echo "Storage bump detected, all good!"
fi
-regression-test-binary-size:
-name: Regression test binary size
-runs-on: ubuntu-20.04
-env:
-CC: gcc-10
-CXX: g++-10
-GEN: ninja
-BUILD_TPCH: 1
-BUILD_TPCDS: 1
-BUILD_JSON: 1
-BUILD_PARQUET: 1
-EXTENSION_STATIC_BUILD: 1
-steps:
-- uses: actions/checkout@v4
-with:
-fetch-depth: 0
-
-- uses: actions/setup-python@v5
-with:
-python-version: '3.12'
-
-- name: Install
-shell: bash
-run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build && pip install requests
-
-- name: Setup Ccache
-uses: hendrikmuhs/ccache-action@main
-with:
-key: ${{ github.job }}
-save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }}
-
-- name: Build
-shell: bash
-run: |
-make
-git clone --branch ${{ env.BASE_BRANCH }} https://github.com/duckdb/duckdb.git --depth=1
-cd duckdb
-make
-cd ..
regression-test-binary-size:
name: Regression test binary size
runs-on: ubuntu-20.04
2 changes: 1 addition & 1 deletion extension/core_functions/core_functions_extension.cpp
@@ -9,7 +9,7 @@
namespace duckdb {

template <class T>
-void FillExtraInfo(const StaticFunctionDefinition &function, T &info) {
+static void FillExtraInfo(const StaticFunctionDefinition &function, T &info) {
info.internal = true;
info.description = function.description;
info.parameter_names = StringUtil::Split(function.parameters, ",");
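
The only functional change here is linkage: marking the file-local helper `static` gives it internal linkage, so it is not exported from the translation unit and cannot clash with same-named helpers elsewhere, which is typically what clang-tidy-style checks on undeclared free functions ask for. A minimal sketch of the pattern, with illustrative stand-in types rather than DuckDB's:

```cpp
#include <string>

// Illustrative stand-in for DuckDB's function info types.
struct ScalarFunctionInfo {
	bool internal = false;
	std::string description;
};

// 'static' gives this file-local helper internal linkage: all of its
// instantiations stay private to this translation unit.
template <class T>
static void FillExtraInfo(const std::string &description, T &info) {
	info.internal = true;
	info.description = description;
}

int main() {
	ScalarFunctionInfo info;
	FillExtraInfo("example function", info);
	return info.internal ? 0 : 1;
}
```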
4 changes: 2 additions & 2 deletions extension/core_functions/function_list.cpp
@@ -47,7 +47,7 @@ namespace duckdb {
{ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }

// this list is generated by scripts/generate_functions.py
-static const StaticFunctionDefinition internal_functions[] = {
+static const StaticFunctionDefinition core_functions[] = {
DUCKDB_SCALAR_FUNCTION(FactorialOperatorFun),
DUCKDB_SCALAR_FUNCTION_SET(BitwiseAndFun),
DUCKDB_SCALAR_FUNCTION_ALIAS(ListHasAnyFunAlias),
@@ -400,7 +400,7 @@ static const StaticFunctionDefinition internal_functions[] = {
};

const StaticFunctionDefinition *StaticFunctionDefinition::GetFunctionList() {
-return internal_functions;
+return core_functions;
}

} // namespace duckdb
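
The rename from `internal_functions` to `core_functions` tracks this file's move into the core_functions extension; the surrounding pattern is unchanged. As a rough sketch of that pattern (names and entries are illustrative, not DuckDB's actual definitions): a script-generated static array, closed by an all-null sentinel row so consumers can iterate without a separate length, exposed through a single accessor:

```cpp
#include <cstdio>

// Illustrative stand-in for StaticFunctionDefinition.
struct FunctionDefinition {
	const char *name;
	const char *parameters;
	const char *description;
};

// Script-generated table: one entry per function, terminated by a
// null sentinel entry.
static const FunctionDefinition core_functions[] = {
	{"factorial", "x", "Computes the factorial of x"},
	{"&", "left,right", "Bitwise AND"},
	{nullptr, nullptr, nullptr},
};

static const FunctionDefinition *GetFunctionList() {
	return core_functions;
}

int main() {
	for (auto *def = GetFunctionList(); def->name; def++) {
		printf("%s(%s): %s\n", def->name, def->parameters, def->description);
	}
	return 0;
}
```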
2 changes: 1 addition & 1 deletion extension/core_functions/scalar/list/array_slice.cpp
@@ -60,7 +60,7 @@ static idx_t CalculateSliceLength(idx_t begin, idx_t end, INDEX_TYPE step, bool

struct BlobSliceOperations {
static int64_t ValueLength(const string_t &value) {
-return value.GetSize();
+return UnsafeNumericCast<int64_t>(value.GetSize());
}

static string_t SliceValue(Vector &result, string_t input, int64_t begin, int64_t end) {
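
`GetSize()` returns an unsigned size while the slice operations work in signed `int64_t`, so the previously implicit narrowing conversion is now spelled out. A minimal sketch of the issue, assuming the value fits in `int64_t` and using `static_cast` as a stand-in for DuckDB's `UnsafeNumericCast`:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

static int64_t ValueLength(const std::string &value) {
	// value.size() is an unsigned size_t; returning it from an int64_t
	// function would otherwise be an implicit narrowing conversion.
	return static_cast<int64_t>(value.size());
}

int main() {
	assert(ValueLength("hello") == 5);
	return 0;
}
```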
1 change: 0 additions & 1 deletion extension/parquet/CMakeLists.txt
@@ -31,7 +31,6 @@ if(NOT CLANG_TIDY)
# parquet/thrift/snappy
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES}
-../../third_party/parquet/parquet_constants.cpp
../../third_party/parquet/parquet_types.cpp
../../third_party/thrift/thrift/protocol/TProtocol.cpp
../../third_party/thrift/thrift/transport/TTransportException.cpp
31 changes: 20 additions & 11 deletions extension/parquet/column_reader.cpp
@@ -29,11 +29,11 @@

namespace duckdb {

-using duckdb_parquet::format::CompressionCodec;
-using duckdb_parquet::format::ConvertedType;
-using duckdb_parquet::format::Encoding;
-using duckdb_parquet::format::PageType;
-using duckdb_parquet::format::Type;
+using duckdb_parquet::CompressionCodec;
+using duckdb_parquet::ConvertedType;
+using duckdb_parquet::Encoding;
+using duckdb_parquet::PageType;
+using duckdb_parquet::Type;

const uint64_t ParquetDecodeUtils::BITPACK_MASKS[] = {0,
1,
@@ -237,6 +237,10 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
block.reset();
PageHeader page_hdr;
reader.Read(page_hdr, *protocol);
+// some basic sanity check
+if (page_hdr.compressed_page_size < 0 || page_hdr.uncompressed_page_size < 0) {
+throw std::runtime_error("Page sizes can't be < 0");
+}

switch (page_hdr.type) {
case PageType::DATA_PAGE_V2:
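
Thrift encodes both page sizes as signed 32-bit integers, so a corrupt or hostile file can carry negative values; rejecting them up front keeps them from becoming enormous unsigned byte counts in later allocations and reads. A minimal sketch of the guard, with a stripped-down header type standing in for the Thrift-generated one:

```cpp
#include <cstdint>
#include <stdexcept>

// Simplified stand-in for the Thrift-generated PageHeader.
struct PageHeader {
	int32_t compressed_page_size;
	int32_t uncompressed_page_size;
};

static void CheckPageSizes(const PageHeader &page_hdr) {
	// Negative sizes would wrap around to huge values once treated as
	// unsigned byte counts, so reject them before any allocation.
	if (page_hdr.compressed_page_size < 0 || page_hdr.uncompressed_page_size < 0) {
		throw std::runtime_error("Page sizes can't be < 0");
	}
}

int main() {
	CheckPageSizes({4096, 16384}); // a plausible page passes the check
	return 0;
}
```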
@@ -277,7 +281,6 @@ void ColumnReader::ResetPage() {

void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
D_ASSERT(page_hdr.type == PageType::DATA_PAGE_V2);
-
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());

AllocateBlock(page_hdr.uncompressed_page_size + 1);
@@ -299,6 +302,10 @@ void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
// copy repeats & defines as-is because FOR SOME REASON they are uncompressed
auto uncompressed_bytes = page_hdr.data_page_header_v2.repetition_levels_byte_length +
page_hdr.data_page_header_v2.definition_levels_byte_length;
+if (uncompressed_bytes > page_hdr.uncompressed_page_size) {
+throw std::runtime_error("Page header inconsistency, uncompressed_page_size needs to be larger than "
+"repetition_levels_byte_length + definition_levels_byte_length");
+}
trans.read(block->ptr, uncompressed_bytes);

auto compressed_bytes = page_hdr.compressed_page_size - uncompressed_bytes;
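
Without this check, `compressed_bytes` is computed purely from values under the file's control: if the (uncompressed) repetition and definition level bytes claim more space than the whole `uncompressed_page_size`, the header is inconsistent and the subtraction above would go negative, letting a later read run past the page. A minimal sketch of the consistency check, with a simplified signature:

```cpp
#include <cstdint>
#include <stdexcept>

static int32_t CompressedPayloadBytes(int32_t compressed_page_size, int32_t uncompressed_page_size,
                                      int32_t repetition_levels_byte_length,
                                      int32_t definition_levels_byte_length) {
	int32_t level_bytes = repetition_levels_byte_length + definition_levels_byte_length;
	if (level_bytes > uncompressed_page_size) {
		throw std::runtime_error("Page header inconsistency, uncompressed_page_size needs to be larger than "
		                         "repetition_levels_byte_length + definition_levels_byte_length");
	}
	// Safe now: the level bytes are stored uncompressed ahead of the payload.
	return compressed_page_size - level_bytes;
}

int main() {
	return CompressedPayloadBytes(100, 200, 10, 10) == 80 ? 0 : 1;
}
```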
@@ -574,12 +581,12 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
auto read_buf = make_shared_ptr<ResizeableBuffer>();

switch (schema.type) {
-case duckdb_parquet::format::Type::INT32:
+case duckdb_parquet::Type::INT32:
read_buf->resize(reader.allocator, sizeof(int32_t) * (read_now - null_count));
dbp_decoder->GetBatch<int32_t>(read_buf->ptr, read_now - null_count);

break;
-case duckdb_parquet::format::Type::INT64:
+case duckdb_parquet::Type::INT64:
read_buf->resize(reader.allocator, sizeof(int64_t) * (read_now - null_count));
dbp_decoder->GetBatch<int64_t>(read_buf->ptr, read_now - null_count);
break;
@@ -604,11 +611,11 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
auto read_buf = make_shared_ptr<ResizeableBuffer>();

switch (schema.type) {
-case duckdb_parquet::format::Type::FLOAT:
+case duckdb_parquet::Type::FLOAT:
read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
break;
-case duckdb_parquet::format::Type::DOUBLE:
+case duckdb_parquet::Type::DOUBLE:
read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
break;
@@ -719,6 +726,7 @@ void StringColumnReader::PrepareDeltaLengthByteArray(ResizeableBuffer &buffer) {
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t i = 0; i < value_count; i++) {
auto str_len = length_data[i];
+buffer.available(str_len);
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
auto result_data = string_data[i].GetDataWriteable();
memcpy(result_data, buffer.ptr, length_data[i]);
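
Both delta string decoders (this one and `PrepareDeltaByteArray` just below) now validate every decoded length against the bytes actually remaining in the page before copying; a crafted length could otherwise make the `memcpy` read past the buffer. A minimal sketch of the guard, with `Buffer` standing in for DuckDB's `ResizeableBuffer` and its `available()` helper:

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>

// Stand-in for ResizeableBuffer: a pointer plus the bytes still available.
struct Buffer {
	const char *ptr;
	uint64_t len;

	void available(uint64_t req_len) const {
		// Reject lengths that point past the end of the page buffer.
		if (req_len > len) {
			throw std::runtime_error("Out of buffer");
		}
	}
};

static std::string ReadString(Buffer &buffer, uint64_t str_len) {
	buffer.available(str_len); // check before the copy, not after
	std::string result(buffer.ptr, str_len);
	buffer.ptr += str_len;
	buffer.len -= str_len;
	return result;
}

int main() {
	Buffer buf {"hello world", 11};
	return ReadString(buf, 5) == "hello" ? 0 : 1;
}
```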
@@ -747,6 +755,7 @@ void StringColumnReader::PrepareDeltaByteArray(ResizeableBuffer &buffer) {
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t i = 0; i < prefix_count; i++) {
auto str_len = prefix_data[i] + suffix_data[i];
+buffer.available(suffix_data[i]);
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
auto result_data = string_data[i].GetDataWriteable();
if (prefix_data[i] > 0) {
@@ -1334,7 +1343,7 @@ static unique_ptr<ColumnReader> CreateDecimalReaderInternal(ParquetReader &reade

template <>
double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size,
-const duckdb_parquet::format::SchemaElement &schema_ele) {
+const duckdb_parquet::SchemaElement &schema_ele) {
double res = 0;
bool positive = (*pointer & 0x80) == 0;
for (idx_t i = 0; i < size; i += 8) {
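
Only the namespace in the signature changes here, but for context: the visible body reads a big-endian two's-complement integer of arbitrary width into a `double`, taking the sign from the top bit of the first byte. A simplified byte-at-a-time reconstruction as a sketch only (the actual implementation processes the buffer in 8-byte chunks):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

static double ReadDecimalAsDouble(const uint8_t *pointer, size_t size) {
	// Top bit of the first (most significant) byte carries the sign.
	bool positive = (*pointer & 0x80) == 0;
	double res = 0;
	for (size_t i = 0; i < size; i++) {
		// For negatives, accumulate the complemented magnitude.
		uint8_t byte = positive ? pointer[i] : static_cast<uint8_t>(~pointer[i]);
		res = res * 256 + byte;
	}
	if (!positive) {
		res = -(res + 1); // undo the two's complement
	}
	return res;
}

int main() {
	const uint8_t pos[] = {0x01, 0x00}; // 256
	const uint8_t neg[] = {0xFF, 0x00}; // -256
	assert(ReadDecimalAsDouble(pos, 2) == 256.0);
	assert(ReadDecimalAsDouble(neg, 2) == -256.0);
	return 0;
}
```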
(The remaining changed files are not shown.)
