Skip to content

Commit

Permalink
implement all wide variants
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Nov 13, 2024
1 parent a900b64 commit 1f22e76
Show file tree
Hide file tree
Showing 27 changed files with 468 additions and 2 deletions.
2 changes: 2 additions & 0 deletions benchmark/fivetran/fivetran.benchmark.in
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ cache fivetran.duckdb no_connect
# we also create an empty data/update file that doesn't match anything (try to mess with parquet cardinality estimation)
# after generating thin variant, repeat entire data generation for wide variant
# for the wide variant we modulo 4 to reduce the size of the data (so the number of parquet files is similar to thin)
# wide variant queries reuse the number of the equivalent thin query plus 50
# this keeps the thin and wide queries from being interleaved and makes them easy to compare

load benchmark/fivetran/init/load.sql

Expand Down
10 changes: 10 additions & 0 deletions benchmark/fivetran/q51.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q51.benchmark
# description: Run query 51 from the Fivetran benchmarks (wide delete scenario with string + instant pk)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct count in wide/existing minus the distinct count in wide/incoming.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=51
QUERY_NUMBER_PADDED=51
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q52.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q52.benchmark
# description: Run query 52 from the Fivetran benchmarks (wide delete scenario with string + instant pk, with tiny existing file)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct count in wide/existing minus the distinct count in wide/incoming.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=52
QUERY_NUMBER_PADDED=52
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q53.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q53.benchmark
# description: Run query 53 from the Fivetran benchmarks (wide delete scenario with string + instant pk, with tiny incoming file)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct count in wide/existing minus the distinct count in wide/incoming.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=53
QUERY_NUMBER_PADDED=53
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q54.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q54.benchmark
# description: Run query 54 from the Fivetran benchmarks (wide delete scenario with string + instant pk, with tiny existing and incoming file)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct count in wide/existing minus the distinct count in wide/incoming.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=54
QUERY_NUMBER_PADDED=54
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q55.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q55.benchmark
# description: Run query 55 from the Fivetran benchmarks (wide update scenario with string + instant pk)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct string_pk count in wide/existing.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=55
QUERY_NUMBER_PADDED=55
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q56.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q56.benchmark
# description: Run query 56 from the Fivetran benchmarks (wide update scenario with string + instant pk, with tiny existing file)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct string_pk count in wide/existing.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=56
QUERY_NUMBER_PADDED=56
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q57.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q57.benchmark
# description: Run query 57 from the Fivetran benchmarks (wide update scenario with string + instant pk, with tiny incoming file)
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct string_pk count in wide/existing.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=57
QUERY_NUMBER_PADDED=57
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q58.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q58.benchmark
# description: Run query 58 from the Fivetran benchmarks
# group: [fivetran]

# Result check: the distinct string_pk count of the written output is compared with the
# distinct string_pk count in wide/existing.
# RESULT_ANSWER=1 is the expected value of that comparison (presumably "true" - confirm against the runner).
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=58
QUERY_NUMBER_PADDED=58
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet')
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q59.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q59.benchmark
# description: Run query 59 from the Fivetran benchmarks
# group: [fivetran]

# Result check: the row count of __answer divided by the number of rows glob() returns for the
# wide/existing parquet pattern must lie between 0.3 and 0.5 (inclusive).
# NOTE(review): __answer is not defined in this file - presumably it holds the benchmark query's result; confirm.
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=59
QUERY_NUMBER_PADDED=59
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q60.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q60.benchmark
# description: Run query 60 from the Fivetran benchmarks
# group: [fivetran]

# Result check: the row count of __answer divided by the number of rows glob() returns for the
# wide/existing parquet pattern must lie between 0.3 and 0.5 (inclusive).
# NOTE(review): __answer is not defined in this file - presumably it holds the benchmark query's result; confirm.
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=60
QUERY_NUMBER_PADDED=60
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q61.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q61.benchmark
# description: Run query 61 from the Fivetran benchmarks
# group: [fivetran]

# Result check: the row count of __answer divided by the number of rows glob() returns for the
# wide/existing parquet pattern must lie between 0.3 and 0.5 (inclusive).
# NOTE(review): __answer is not defined in this file - presumably it holds the benchmark query's result; confirm.
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=61
QUERY_NUMBER_PADDED=61
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5
RESULT_ANSWER=1
10 changes: 10 additions & 0 deletions benchmark/fivetran/q62.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# name: benchmark/fivetran/q62.benchmark
# description: Run query 62 from the Fivetran benchmarks
# group: [fivetran]

# Result check: the row count of __answer divided by the number of rows glob() returns for the
# wide/existing parquet pattern must lie between 0.3 and 0.5 (inclusive).
# NOTE(review): __answer is not defined in this file - presumably it holds the benchmark query's result; confirm.
template benchmark/fivetran/fivetran.benchmark.in
QUERY_NUMBER=62
QUERY_NUMBER_PADDED=62
RESULT_COLUMNS=I
RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5
RESULT_ANSWER=1
2 changes: 1 addition & 1 deletion benchmark/fivetran/queries/q10.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ SELECT
DISTINCT("_fivetran_filename")
FROM
read_parquet([
'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet',
'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet',
'duckdb_benchmark_data/fivetran/thin/existing/*.parquet'
], filename=_fivetran_filename) AS existing
WHERE EXISTS (
Expand Down
2 changes: 1 addition & 1 deletion benchmark/fivetran/queries/q14.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ SELECT
DISTINCT("_fivetran_filename")
FROM
read_parquet([
'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet',
'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet',
'duckdb_benchmark_data/fivetran/thin/existing/*.parquet'
], filename=_fivetran_filename) AS existing
WHERE EXISTS (
Expand Down
28 changes: 28 additions & 0 deletions benchmark/fivetran/queries/q51.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Wide delete scenario keyed on string_pk + instant_id.
-- Keeps each row of the wide "existing" files for which no row in the wide
-- "incoming" files has the same (string_pk, instant_id), and writes the
-- surviving rows as Parquet to the benchmark output path.
COPY (
    SELECT *
    FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/existing/*.parquet']) AS existing
    WHERE NOT EXISTS (
        -- anti-join probe: the select list is irrelevant, only row existence matters
        SELECT TRUE
        FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/incoming/*.parquet']) AS staging
        WHERE existing.string_pk = staging.string_pk
          AND existing.instant_id = staging.instant_id
    )
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
29 changes: 29 additions & 0 deletions benchmark/fivetran/queries/q52.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Wide delete scenario keyed on string_pk + instant_id, with the tiny data
-- file added to the "existing" side.
-- Keeps each existing row for which no row in the wide "incoming" files has
-- the same (string_pk, instant_id), and writes the survivors as Parquet.
-- NOTE(review): the tiny file is listed after the glob here, whereas q54/q56
-- list tiny_data.parquet first on the existing side - confirm the order is intentional.
COPY (
    SELECT *
    FROM read_parquet([
        'duckdb_benchmark_data/fivetran/wide/existing/*.parquet',
        'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet'
    ]) AS existing
    WHERE NOT EXISTS (
        -- anti-join probe: the select list is irrelevant, only row existence matters
        SELECT TRUE
        FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/incoming/*.parquet']) AS staging
        WHERE existing.string_pk = staging.string_pk
          AND existing.instant_id = staging.instant_id
    )
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
29 changes: 29 additions & 0 deletions benchmark/fivetran/queries/q53.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Wide delete scenario keyed on string_pk + instant_id, with the tiny data
-- file added to the "incoming" (staging) side.
-- Keeps each row of the wide "existing" files for which no staging row has
-- the same (string_pk, instant_id), and writes the survivors as Parquet.
COPY (
    SELECT *
    FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/existing/*.parquet']) AS existing
    WHERE NOT EXISTS (
        -- anti-join probe: the select list is irrelevant, only row existence matters
        SELECT TRUE
        FROM read_parquet([
            'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet',
            'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet'
        ]) AS staging
        WHERE existing.string_pk = staging.string_pk
          AND existing.instant_id = staging.instant_id
    )
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
30 changes: 30 additions & 0 deletions benchmark/fivetran/queries/q54.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- Wide delete scenario keyed on string_pk + instant_id, with the tiny data
-- file added to both the "existing" and the "incoming" (staging) side.
-- Keeps each existing row for which no staging row has the same
-- (string_pk, instant_id), and writes the survivors as Parquet.
COPY (
    SELECT *
    FROM read_parquet([
        'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet',
        'duckdb_benchmark_data/fivetran/wide/existing/*.parquet'
    ]) AS existing
    WHERE NOT EXISTS (
        -- anti-join probe: the select list is irrelevant, only row existence matters
        SELECT TRUE
        FROM read_parquet([
            'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet',
            'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet'
        ]) AS staging
        WHERE existing.string_pk = staging.string_pk
          AND existing.instant_id = staging.instant_id
    )
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
37 changes: 37 additions & 0 deletions benchmark/fivetran/queries/q55.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Wide update scenario keyed on string_pk + instant_id.
-- Left-joins the wide "existing" files to the wide "update" files on
-- (string_pk, instant_id, _fivetran_start). Every existing row is emitted;
-- ten string columns are passed through update_macro and _fivetran_synced is
-- taken from staging whenever the staging value is not NULL.
-- update_macro is defined outside this file; presumably it selects between the
-- existing and staging value based on the indexed entry of
-- _fivetran_updated_cols - confirm against its definition.
COPY (
    SELECT
        existing.* REPLACE (
            -- NOTE(review): indices 3 and 7 are each passed twice below - confirm this is intended.
            update_macro(existing.string_0, staging.string_0, 1, _fivetran_updated_cols) AS string_0,
            update_macro(existing.string_1, staging.string_1, 2, _fivetran_updated_cols) AS string_1,
            update_macro(existing.string_10, staging.string_10, 3, _fivetran_updated_cols) AS string_10,
            update_macro(existing.string_11, staging.string_11, 3, _fivetran_updated_cols) AS string_11,
            update_macro(existing.string_20, staging.string_20, 4, _fivetran_updated_cols) AS string_20,
            update_macro(existing.string_21, staging.string_21, 5, _fivetran_updated_cols) AS string_21,
            update_macro(existing.string_30, staging.string_30, 6, _fivetran_updated_cols) AS string_30,
            update_macro(existing.string_31, staging.string_31, 7, _fivetran_updated_cols) AS string_31,
            update_macro(existing.string_40, staging.string_40, 7, _fivetran_updated_cols) AS string_40,
            update_macro(existing.string_41, staging.string_41, 8, _fivetran_updated_cols) AS string_41,
            CASE
                WHEN staging._fivetran_synced IS NULL THEN existing._fivetran_synced
                ELSE staging._fivetran_synced
            END AS _fivetran_synced
        )
    FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/existing/*.parquet']) AS existing
    LEFT JOIN read_parquet(['duckdb_benchmark_data/fivetran/wide/update/*.parquet']) AS staging
        ON existing.string_pk = staging.string_pk
        AND existing.instant_id = staging.instant_id
        AND existing._fivetran_start = staging._fivetran_start
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
38 changes: 38 additions & 0 deletions benchmark/fivetran/queries/q56.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
-- Wide update scenario keyed on string_pk + instant_id, with the tiny data
-- file added to the "existing" side.
-- Left-joins the existing rows to the wide "update" files on
-- (string_pk, instant_id, _fivetran_start). Every existing row is emitted;
-- ten string columns are passed through update_macro and _fivetran_synced is
-- taken from staging whenever the staging value is not NULL.
-- update_macro is defined outside this file; presumably it selects between the
-- existing and staging value based on the indexed entry of
-- _fivetran_updated_cols - confirm against its definition.
COPY (
    SELECT
        existing.* REPLACE (
            -- NOTE(review): indices 3 and 7 are each passed twice below - confirm this is intended.
            update_macro(existing.string_0, staging.string_0, 1, _fivetran_updated_cols) AS string_0,
            update_macro(existing.string_1, staging.string_1, 2, _fivetran_updated_cols) AS string_1,
            update_macro(existing.string_10, staging.string_10, 3, _fivetran_updated_cols) AS string_10,
            update_macro(existing.string_11, staging.string_11, 3, _fivetran_updated_cols) AS string_11,
            update_macro(existing.string_20, staging.string_20, 4, _fivetran_updated_cols) AS string_20,
            update_macro(existing.string_21, staging.string_21, 5, _fivetran_updated_cols) AS string_21,
            update_macro(existing.string_30, staging.string_30, 6, _fivetran_updated_cols) AS string_30,
            update_macro(existing.string_31, staging.string_31, 7, _fivetran_updated_cols) AS string_31,
            update_macro(existing.string_40, staging.string_40, 7, _fivetran_updated_cols) AS string_40,
            update_macro(existing.string_41, staging.string_41, 8, _fivetran_updated_cols) AS string_41,
            CASE
                WHEN staging._fivetran_synced IS NULL THEN existing._fivetran_synced
                ELSE staging._fivetran_synced
            END AS _fivetran_synced
        )
    FROM read_parquet([
        'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet',
        'duckdb_benchmark_data/fivetran/wide/existing/*.parquet'
    ]) AS existing
    LEFT JOIN read_parquet(['duckdb_benchmark_data/fivetran/wide/update/*.parquet']) AS staging
        ON existing.string_pk = staging.string_pk
        AND existing.instant_id = staging.instant_id
        AND existing._fivetran_start = staging._fivetran_start
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
38 changes: 38 additions & 0 deletions benchmark/fivetran/queries/q57.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
-- Wide update scenario keyed on string_pk + instant_id, with the tiny update
-- file added to the "incoming" (staging) side.
-- Left-joins the wide "existing" files to the staging rows on
-- (string_pk, instant_id, _fivetran_start). Every existing row is emitted;
-- ten string columns are passed through update_macro and _fivetran_synced is
-- taken from staging whenever the staging value is not NULL.
-- update_macro is defined outside this file; presumably it selects between the
-- existing and staging value based on the indexed entry of
-- _fivetran_updated_cols - confirm against its definition.
COPY (
    SELECT
        existing.* REPLACE (
            -- NOTE(review): indices 3 and 7 are each passed twice below - confirm this is intended.
            update_macro(existing.string_0, staging.string_0, 1, _fivetran_updated_cols) AS string_0,
            update_macro(existing.string_1, staging.string_1, 2, _fivetran_updated_cols) AS string_1,
            update_macro(existing.string_10, staging.string_10, 3, _fivetran_updated_cols) AS string_10,
            update_macro(existing.string_11, staging.string_11, 3, _fivetran_updated_cols) AS string_11,
            update_macro(existing.string_20, staging.string_20, 4, _fivetran_updated_cols) AS string_20,
            update_macro(existing.string_21, staging.string_21, 5, _fivetran_updated_cols) AS string_21,
            update_macro(existing.string_30, staging.string_30, 6, _fivetran_updated_cols) AS string_30,
            update_macro(existing.string_31, staging.string_31, 7, _fivetran_updated_cols) AS string_31,
            update_macro(existing.string_40, staging.string_40, 7, _fivetran_updated_cols) AS string_40,
            update_macro(existing.string_41, staging.string_41, 8, _fivetran_updated_cols) AS string_41,
            CASE
                WHEN staging._fivetran_synced IS NULL THEN existing._fivetran_synced
                ELSE staging._fivetran_synced
            END AS _fivetran_synced
        )
    FROM read_parquet(['duckdb_benchmark_data/fivetran/wide/existing/*.parquet']) AS existing
    LEFT JOIN read_parquet([
        'duckdb_benchmark_data/fivetran/wide/tiny_update.parquet',
        'duckdb_benchmark_data/fivetran/wide/update/*.parquet'
    ]) AS staging
        ON existing.string_pk = staging.string_pk
        AND existing.instant_id = staging.instant_id
        AND existing._fivetran_start = staging._fivetran_start
)
TO 'duckdb_benchmark_data/fivetran/output' (
    FORMAT PARQUET,
    COMPRESSION ZSTD,
    COMPRESSION_LEVEL 1,
    PER_THREAD_OUTPUT TRUE,
    ROW_GROUP_SIZE_BYTES '64 MB',
    ROW_GROUPS_PER_FILE 1,
    OVERWRITE TRUE
);
Loading

0 comments on commit 1f22e76

Please sign in to comment.