merge with feature and set proper lowest zstd level
lnkuiper committed Oct 18, 2024
2 parents ffefbe0 + 1979504 commit 36885cc
Showing 204 changed files with 36,459 additions and 12,231 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/CodeQuality.yml
@@ -117,14 +117,14 @@ jobs:
run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build clang-tidy && sudo pip3 install pybind11[global] --break-system-packages

- name: Setup Ccache
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
uses: hendrikmuhs/ccache-action@main
with:
key: ${{ github.job }}
save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }}

- name: Download clang-tidy-cache
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
shell: bash
run: |
set -e
@@ -134,10 +134,10 @@ jobs:
- name: Tidy Check
shell: bash
-if: ${{ github.ref == 'refs/heads/main' }}
+if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feature' }}
run: make tidy-check TIDY_BINARY=/tmp/clang-tidy-cache

- name: Tidy Check Diff
shell: bash
-if: ${{ github.ref != 'refs/heads/main' }}
+if: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/feature' }}
run: make tidy-check-diff
40 changes: 0 additions & 40 deletions .github/workflows/Regression.yml
@@ -222,46 +222,6 @@ jobs:
echo "Storage bump detected, all good!"
fi
-regression-test-binary-size:
-name: Regression test binary size
-runs-on: ubuntu-20.04
-env:
-CC: gcc-10
-CXX: g++-10
-GEN: ninja
-BUILD_TPCH: 1
-BUILD_TPCDS: 1
-BUILD_JSON: 1
-BUILD_PARQUET: 1
-EXTENSION_STATIC_BUILD: 1
-steps:
-- uses: actions/checkout@v4
-with:
-fetch-depth: 0
-
-- uses: actions/setup-python@v5
-with:
-python-version: '3.12'
-
-- name: Install
-shell: bash
-run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build && pip install requests
-
-- name: Setup Ccache
-uses: hendrikmuhs/ccache-action@main
-with:
-key: ${{ github.job }}
-save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }}
-
-- name: Build
-shell: bash
-run: |
-make
-git clone --branch ${{ env.BASE_BRANCH }} https://github.com/duckdb/duckdb.git --depth=1
-cd duckdb
-make
-cd ..
regression-test-binary-size:
name: Regression test binary size
runs-on: ubuntu-20.04
2 changes: 1 addition & 1 deletion extension/core_functions/core_functions_extension.cpp
@@ -9,7 +9,7 @@
namespace duckdb {

template <class T>
-void FillExtraInfo(const StaticFunctionDefinition &function, T &info) {
+static void FillExtraInfo(const StaticFunctionDefinition &function, T &info) {
info.internal = true;
info.description = function.description;
info.parameter_names = StringUtil::Split(function.parameters, ",");
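
The only functional change here is linkage: marking the file-local helper `static` gives it internal linkage, so it is not exported from the translation unit and cannot clash with same-named helpers elsewhere, which is typically what clang-tidy-style checks on undeclared free functions ask for. A minimal sketch of the pattern, with illustrative stand-in types rather than DuckDB's:

```cpp
#include <string>

// Illustrative stand-in for DuckDB's function info types.
struct ScalarFunctionInfo {
	bool internal = false;
	std::string description;
};

// 'static' gives this file-local helper internal linkage: all of its
// instantiations stay private to this translation unit.
template <class T>
static void FillExtraInfo(const std::string &description, T &info) {
	info.internal = true;
	info.description = description;
}

int main() {
	ScalarFunctionInfo info;
	FillExtraInfo("example function", info);
	return info.internal ? 0 : 1;
}
```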
4 changes: 2 additions & 2 deletions extension/core_functions/function_list.cpp
@@ -47,7 +47,7 @@ namespace duckdb {
{ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }

// this list is generated by scripts/generate_functions.py
-static const StaticFunctionDefinition internal_functions[] = {
+static const StaticFunctionDefinition core_functions[] = {
DUCKDB_SCALAR_FUNCTION(FactorialOperatorFun),
DUCKDB_SCALAR_FUNCTION_SET(BitwiseAndFun),
DUCKDB_SCALAR_FUNCTION_ALIAS(ListHasAnyFunAlias),
@@ -400,7 +400,7 @@ static const StaticFunctionDefinition internal_functions[] = {
};

const StaticFunctionDefinition *StaticFunctionDefinition::GetFunctionList() {
-return internal_functions;
+return core_functions;
}

} // namespace duckdb
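
The rename from `internal_functions` to `core_functions` tracks this file's move into the core_functions extension; the surrounding pattern is unchanged. As a rough sketch of that pattern (names and entries are illustrative, not DuckDB's actual definitions): a script-generated static array, closed by an all-null sentinel row so consumers can iterate without a separate length, exposed through a single accessor:

```cpp
#include <cstdio>

// Illustrative stand-in for StaticFunctionDefinition.
struct FunctionDefinition {
	const char *name;
	const char *parameters;
	const char *description;
};

// Script-generated table: one entry per function, terminated by a
// null sentinel entry.
static const FunctionDefinition core_functions[] = {
	{"factorial", "x", "Computes the factorial of x"},
	{"&", "left,right", "Bitwise AND"},
	{nullptr, nullptr, nullptr},
};

static const FunctionDefinition *GetFunctionList() {
	return core_functions;
}

int main() {
	for (auto *def = GetFunctionList(); def->name; def++) {
		printf("%s(%s): %s\n", def->name, def->parameters, def->description);
	}
	return 0;
}
```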
2 changes: 1 addition & 1 deletion extension/core_functions/scalar/list/array_slice.cpp
@@ -60,7 +60,7 @@ static idx_t CalculateSliceLength(idx_t begin, idx_t end, INDEX_TYPE step, bool

struct BlobSliceOperations {
static int64_t ValueLength(const string_t &value) {
-return value.GetSize();
+return UnsafeNumericCast<int64_t>(value.GetSize());
}

static string_t SliceValue(Vector &result, string_t input, int64_t begin, int64_t end) {
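
`GetSize()` returns an unsigned size while the slice operations work in signed `int64_t`, so the previously implicit narrowing conversion is now spelled out. A minimal sketch of the issue, assuming the value fits in `int64_t` and using `static_cast` as a stand-in for DuckDB's `UnsafeNumericCast`:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

static int64_t ValueLength(const std::string &value) {
	// value.size() is an unsigned size_t; returning it from an int64_t
	// function would otherwise be an implicit narrowing conversion.
	return static_cast<int64_t>(value.size());
}

int main() {
	assert(ValueLength("hello") == 5);
	return 0;
}
```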
1 change: 0 additions & 1 deletion extension/parquet/CMakeLists.txt
@@ -31,7 +31,6 @@ if(NOT CLANG_TIDY)
# parquet/thrift/snappy
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES}
-../../third_party/parquet/parquet_constants.cpp
../../third_party/parquet/parquet_types.cpp
../../third_party/thrift/thrift/protocol/TProtocol.cpp
../../third_party/thrift/thrift/transport/TTransportException.cpp
31 changes: 20 additions & 11 deletions extension/parquet/column_reader.cpp
@@ -29,11 +29,11 @@

namespace duckdb {

-using duckdb_parquet::format::CompressionCodec;
-using duckdb_parquet::format::ConvertedType;
-using duckdb_parquet::format::Encoding;
-using duckdb_parquet::format::PageType;
-using duckdb_parquet::format::Type;
+using duckdb_parquet::CompressionCodec;
+using duckdb_parquet::ConvertedType;
+using duckdb_parquet::Encoding;
+using duckdb_parquet::PageType;
+using duckdb_parquet::Type;

const uint64_t ParquetDecodeUtils::BITPACK_MASKS[] = {0,
1,
@@ -237,6 +237,10 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
block.reset();
PageHeader page_hdr;
reader.Read(page_hdr, *protocol);
+// some basic sanity check
+if (page_hdr.compressed_page_size < 0 || page_hdr.uncompressed_page_size < 0) {
+throw std::runtime_error("Page sizes can't be < 0");
+}

switch (page_hdr.type) {
case PageType::DATA_PAGE_V2:
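
Thrift encodes both page sizes as signed 32-bit integers, so a corrupt or hostile file can carry negative values; rejecting them up front keeps them from becoming enormous unsigned byte counts in later allocations and reads. A minimal sketch of the guard, with a stripped-down header type standing in for the Thrift-generated one:

```cpp
#include <cstdint>
#include <stdexcept>

// Simplified stand-in for the Thrift-generated PageHeader.
struct PageHeader {
	int32_t compressed_page_size;
	int32_t uncompressed_page_size;
};

static void CheckPageSizes(const PageHeader &page_hdr) {
	// Negative sizes would wrap around to huge values once treated as
	// unsigned byte counts, so reject them before any allocation.
	if (page_hdr.compressed_page_size < 0 || page_hdr.uncompressed_page_size < 0) {
		throw std::runtime_error("Page sizes can't be < 0");
	}
}

int main() {
	CheckPageSizes({4096, 16384}); // a plausible page passes the check
	return 0;
}
```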
@@ -277,7 +281,6 @@ void ColumnReader::ResetPage() {

void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
D_ASSERT(page_hdr.type == PageType::DATA_PAGE_V2);
-
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());

AllocateBlock(page_hdr.uncompressed_page_size + 1);
@@ -299,6 +302,10 @@ void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
// copy repeats & defines as-is because FOR SOME REASON they are uncompressed
auto uncompressed_bytes = page_hdr.data_page_header_v2.repetition_levels_byte_length +
page_hdr.data_page_header_v2.definition_levels_byte_length;
+if (uncompressed_bytes > page_hdr.uncompressed_page_size) {
+throw std::runtime_error("Page header inconsistency, uncompressed_page_size needs to be larger than "
+"repetition_levels_byte_length + definition_levels_byte_length");
+}
trans.read(block->ptr, uncompressed_bytes);

auto compressed_bytes = page_hdr.compressed_page_size - uncompressed_bytes;
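
Without this check, `compressed_bytes` is computed purely from values under the file's control: if the (uncompressed) repetition and definition level bytes claim more space than the whole `uncompressed_page_size`, the header is inconsistent and the subtraction above would go negative, letting a later read run past the page. A minimal sketch of the consistency check, with a simplified signature:

```cpp
#include <cstdint>
#include <stdexcept>

static int32_t CompressedPayloadBytes(int32_t compressed_page_size, int32_t uncompressed_page_size,
                                      int32_t repetition_levels_byte_length,
                                      int32_t definition_levels_byte_length) {
	int32_t level_bytes = repetition_levels_byte_length + definition_levels_byte_length;
	if (level_bytes > uncompressed_page_size) {
		throw std::runtime_error("Page header inconsistency, uncompressed_page_size needs to be larger than "
		                         "repetition_levels_byte_length + definition_levels_byte_length");
	}
	// Safe now: the level bytes are stored uncompressed ahead of the payload.
	return compressed_page_size - level_bytes;
}

int main() {
	return CompressedPayloadBytes(100, 200, 10, 10) == 80 ? 0 : 1;
}
```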
@@ -574,12 +581,12 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
auto read_buf = make_shared_ptr<ResizeableBuffer>();

switch (schema.type) {
-case duckdb_parquet::format::Type::INT32:
+case duckdb_parquet::Type::INT32:
read_buf->resize(reader.allocator, sizeof(int32_t) * (read_now - null_count));
dbp_decoder->GetBatch<int32_t>(read_buf->ptr, read_now - null_count);

break;
-case duckdb_parquet::format::Type::INT64:
+case duckdb_parquet::Type::INT64:
read_buf->resize(reader.allocator, sizeof(int64_t) * (read_now - null_count));
dbp_decoder->GetBatch<int64_t>(read_buf->ptr, read_now - null_count);
break;
@@ -604,11 +611,11 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
auto read_buf = make_shared_ptr<ResizeableBuffer>();

switch (schema.type) {
-case duckdb_parquet::format::Type::FLOAT:
+case duckdb_parquet::Type::FLOAT:
read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
break;
-case duckdb_parquet::format::Type::DOUBLE:
+case duckdb_parquet::Type::DOUBLE:
read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
break;
@@ -719,6 +726,7 @@ void StringColumnReader::PrepareDeltaLengthByteArray(ResizeableBuffer &buffer) {
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t i = 0; i < value_count; i++) {
auto str_len = length_data[i];
+buffer.available(str_len);
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
auto result_data = string_data[i].GetDataWriteable();
memcpy(result_data, buffer.ptr, length_data[i]);
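
Both delta string decoders (this one and `PrepareDeltaByteArray` just below) now validate every decoded length against the bytes actually remaining in the page before copying; a crafted length could otherwise make the `memcpy` read past the buffer. A minimal sketch of the guard, with `Buffer` standing in for DuckDB's `ResizeableBuffer` and its `available()` helper:

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>

// Stand-in for ResizeableBuffer: a pointer plus the bytes still available.
struct Buffer {
	const char *ptr;
	uint64_t len;

	void available(uint64_t req_len) const {
		// Reject lengths that point past the end of the page buffer.
		if (req_len > len) {
			throw std::runtime_error("Out of buffer");
		}
	}
};

static std::string ReadString(Buffer &buffer, uint64_t str_len) {
	buffer.available(str_len); // check before the copy, not after
	std::string result(buffer.ptr, str_len);
	buffer.ptr += str_len;
	buffer.len -= str_len;
	return result;
}

int main() {
	Buffer buf {"hello world", 11};
	return ReadString(buf, 5) == "hello" ? 0 : 1;
}
```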
@@ -747,6 +755,7 @@ void StringColumnReader::PrepareDeltaByteArray(ResizeableBuffer &buffer) {
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t i = 0; i < prefix_count; i++) {
auto str_len = prefix_data[i] + suffix_data[i];
+buffer.available(suffix_data[i]);
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
auto result_data = string_data[i].GetDataWriteable();
if (prefix_data[i] > 0) {
@@ -1334,7 +1343,7 @@ static unique_ptr<ColumnReader> CreateDecimalReaderInternal(ParquetReader &reade

template <>
double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size,
-const duckdb_parquet::format::SchemaElement &schema_ele) {
+const duckdb_parquet::SchemaElement &schema_ele) {
double res = 0;
bool positive = (*pointer & 0x80) == 0;
for (idx_t i = 0; i < size; i += 8) {
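
Only the namespace in the signature changes here, but for context: the visible body reads a big-endian two's-complement integer of arbitrary width into a `double`, taking the sign from the top bit of the first byte. A simplified byte-at-a-time reconstruction as a sketch only (the actual implementation processes the buffer in 8-byte chunks):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

static double ReadDecimalAsDouble(const uint8_t *pointer, size_t size) {
	// Top bit of the first (most significant) byte carries the sign.
	bool positive = (*pointer & 0x80) == 0;
	double res = 0;
	for (size_t i = 0; i < size; i++) {
		// For negatives, accumulate the complemented magnitude.
		uint8_t byte = positive ? pointer[i] : static_cast<uint8_t>(~pointer[i]);
		res = res * 256 + byte;
	}
	if (!positive) {
		res = -(res + 1); // undo the two's complement
	}
	return res;
}

int main() {
	const uint8_t pos[] = {0x01, 0x00}; // 256
	const uint8_t neg[] = {0xFF, 0x00}; // -256
	assert(ReadDecimalAsDouble(pos, 2) == 256.0);
	assert(ReadDecimalAsDouble(neg, 2) == -256.0);
	return 0;
}
```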
(The remaining changed files are not shown.)
