From 1f22e768b1ea4bef20939a67049ae2435fad210c Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Wed, 13 Nov 2024 10:47:38 +0100 Subject: [PATCH] implement all wide variants --- benchmark/fivetran/fivetran.benchmark.in | 2 ++ benchmark/fivetran/q51.benchmark | 10 ++++++ benchmark/fivetran/q52.benchmark | 10 ++++++ benchmark/fivetran/q53.benchmark | 10 ++++++ benchmark/fivetran/q54.benchmark | 10 ++++++ benchmark/fivetran/q55.benchmark | 10 ++++++ benchmark/fivetran/q56.benchmark | 10 ++++++ benchmark/fivetran/q57.benchmark | 10 ++++++ benchmark/fivetran/q58.benchmark | 10 ++++++ benchmark/fivetran/q59.benchmark | 10 ++++++ benchmark/fivetran/q60.benchmark | 10 ++++++ benchmark/fivetran/q61.benchmark | 10 ++++++ benchmark/fivetran/q62.benchmark | 10 ++++++ benchmark/fivetran/queries/q10.sql | 2 +- benchmark/fivetran/queries/q14.sql | 2 +- benchmark/fivetran/queries/q51.sql | 28 +++++++++++++++++ benchmark/fivetran/queries/q52.sql | 29 ++++++++++++++++++ benchmark/fivetran/queries/q53.sql | 29 ++++++++++++++++++ benchmark/fivetran/queries/q54.sql | 30 ++++++++++++++++++ benchmark/fivetran/queries/q55.sql | 37 ++++++++++++++++++++++ benchmark/fivetran/queries/q56.sql | 38 +++++++++++++++++++++++ benchmark/fivetran/queries/q57.sql | 38 +++++++++++++++++++++++ benchmark/fivetran/queries/q58.sql | 39 ++++++++++++++++++++++++ benchmark/fivetran/queries/q59.sql | 18 +++++++++++ benchmark/fivetran/queries/q60.sql | 19 ++++++++++++ benchmark/fivetran/queries/q61.sql | 19 ++++++++++++ benchmark/fivetran/queries/q62.sql | 20 ++++++++++++ 27 files changed, 468 insertions(+), 2 deletions(-) create mode 100644 benchmark/fivetran/q51.benchmark create mode 100644 benchmark/fivetran/q52.benchmark create mode 100644 benchmark/fivetran/q53.benchmark create mode 100644 benchmark/fivetran/q54.benchmark create mode 100644 benchmark/fivetran/q55.benchmark create mode 100644 benchmark/fivetran/q56.benchmark create mode 100644 benchmark/fivetran/q57.benchmark create mode 100644 
benchmark/fivetran/q58.benchmark create mode 100644 benchmark/fivetran/q59.benchmark create mode 100644 benchmark/fivetran/q60.benchmark create mode 100644 benchmark/fivetran/q61.benchmark create mode 100644 benchmark/fivetran/q62.benchmark create mode 100644 benchmark/fivetran/queries/q51.sql create mode 100644 benchmark/fivetran/queries/q52.sql create mode 100644 benchmark/fivetran/queries/q53.sql create mode 100644 benchmark/fivetran/queries/q54.sql create mode 100644 benchmark/fivetran/queries/q55.sql create mode 100644 benchmark/fivetran/queries/q56.sql create mode 100644 benchmark/fivetran/queries/q57.sql create mode 100644 benchmark/fivetran/queries/q58.sql create mode 100644 benchmark/fivetran/queries/q59.sql create mode 100644 benchmark/fivetran/queries/q60.sql create mode 100644 benchmark/fivetran/queries/q61.sql create mode 100644 benchmark/fivetran/queries/q62.sql diff --git a/benchmark/fivetran/fivetran.benchmark.in b/benchmark/fivetran/fivetran.benchmark.in index 7d76666e284..827d4d92aeb 100644 --- a/benchmark/fivetran/fivetran.benchmark.in +++ b/benchmark/fivetran/fivetran.benchmark.in @@ -30,6 +30,8 @@ cache fivetran.duckdb no_connect # we also create an empty data/update file that doesn't match anything (try to mess with parquet cardinality estimation) # after generating thin variant, repeat entire data generation for wide variant # for the wide variant we modulo 4 to reduce the size of the data (so the number of parquet files is similar to thin) +# for the wide variant queries we number the equivalent queries with number +50 +# this makes it so the thin/wide queries aren't interleaved, and makes it easy to compare load benchmark/fivetran/init/load.sql diff --git a/benchmark/fivetran/q51.benchmark b/benchmark/fivetran/q51.benchmark new file mode 100644 index 00000000000..a9b7530639c --- /dev/null +++ b/benchmark/fivetran/q51.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q51.benchmark +# description: Run query 51 from the Fivetran 
benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=51 +QUERY_NUMBER_PADDED=51 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q52.benchmark b/benchmark/fivetran/q52.benchmark new file mode 100644 index 00000000000..b7d7d4970b0 --- /dev/null +++ b/benchmark/fivetran/q52.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q52.benchmark +# description: Run query 52 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=52 +QUERY_NUMBER_PADDED=52 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q53.benchmark b/benchmark/fivetran/q53.benchmark new file mode 100644 index 00000000000..7c37309ca31 --- /dev/null +++ b/benchmark/fivetran/q53.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q53.benchmark +# description: Run query 53 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=53 +QUERY_NUMBER_PADDED=53 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 
'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q54.benchmark b/benchmark/fivetran/q54.benchmark new file mode 100644 index 00000000000..5577fd37b0f --- /dev/null +++ b/benchmark/fivetran/q54.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q54.benchmark +# description: Run query 54 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=54 +QUERY_NUMBER_PADDED=54 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') - (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q55.benchmark b/benchmark/fivetran/q55.benchmark new file mode 100644 index 00000000000..7621fc498f0 --- /dev/null +++ b/benchmark/fivetran/q55.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q55.benchmark +# description: Run query 55 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=55 +QUERY_NUMBER_PADDED=55 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q56.benchmark b/benchmark/fivetran/q56.benchmark new file mode 100644 index 00000000000..1fe6d8cd25c --- /dev/null +++ b/benchmark/fivetran/q56.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q56.benchmark +# description: Run query 56 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=56 +QUERY_NUMBER_PADDED=56 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT 
(SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q57.benchmark b/benchmark/fivetran/q57.benchmark new file mode 100644 index 00000000000..cc972ccfb73 --- /dev/null +++ b/benchmark/fivetran/q57.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q57.benchmark +# description: Run query 57 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=57 +QUERY_NUMBER_PADDED=57 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q58.benchmark b/benchmark/fivetran/q58.benchmark new file mode 100644 index 00000000000..c49a3b2d583 --- /dev/null +++ b/benchmark/fivetran/q58.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q58.benchmark +# description: Run query 58 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=58 +QUERY_NUMBER_PADDED=58 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/output/*.parquet') IS NOT DISTINCT FROM (SELECT count(DISTINCT string_pk) FROM 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet') +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q59.benchmark b/benchmark/fivetran/q59.benchmark new file mode 100644 index 00000000000..1d95c159beb --- /dev/null +++ b/benchmark/fivetran/q59.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q59.benchmark +# description: Run query 59 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=59 
+QUERY_NUMBER_PADDED=59 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5 +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q60.benchmark b/benchmark/fivetran/q60.benchmark new file mode 100644 index 00000000000..6e645714229 --- /dev/null +++ b/benchmark/fivetran/q60.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q60.benchmark +# description: Run query 60 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=60 +QUERY_NUMBER_PADDED=60 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5 +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q61.benchmark b/benchmark/fivetran/q61.benchmark new file mode 100644 index 00000000000..1544913215b --- /dev/null +++ b/benchmark/fivetran/q61.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q61.benchmark +# description: Run query 61 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=61 +QUERY_NUMBER_PADDED=61 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5 +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/q62.benchmark b/benchmark/fivetran/q62.benchmark new file mode 100644 index 00000000000..2567ac330ca --- /dev/null +++ b/benchmark/fivetran/q62.benchmark @@ -0,0 +1,10 @@ +# name: benchmark/fivetran/q62.benchmark +# description: Run query 62 from the Fivetran benchmarks +# group: [fivetran] + +template benchmark/fivetran/fivetran.benchmark.in +QUERY_NUMBER=62 +QUERY_NUMBER_PADDED=62 +RESULT_COLUMNS=I +RESULT_QUERY=SELECT (SELECT count(*) FROM __answer)/(SELECT count(*) FROM 
glob('duckdb_benchmark_data/fivetran/wide/existing/*.parquet')) BETWEEN 0.3 AND 0.5 +RESULT_ANSWER=1 diff --git a/benchmark/fivetran/queries/q10.sql b/benchmark/fivetran/queries/q10.sql index ef5005e8118..b896966bd47 100644 --- a/benchmark/fivetran/queries/q10.sql +++ b/benchmark/fivetran/queries/q10.sql @@ -3,7 +3,7 @@ SELECT DISTINCT("_fivetran_filename") FROM read_parquet([ - 'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet', 'duckdb_benchmark_data/fivetran/thin/existing/*.parquet' ], filename=_fivetran_filename) AS existing WHERE EXISTS ( diff --git a/benchmark/fivetran/queries/q14.sql b/benchmark/fivetran/queries/q14.sql index a3a641ce912..151e97b305a 100644 --- a/benchmark/fivetran/queries/q14.sql +++ b/benchmark/fivetran/queries/q14.sql @@ -3,7 +3,7 @@ SELECT DISTINCT("_fivetran_filename") FROM read_parquet([ - 'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/thin/tiny_data.parquet', 'duckdb_benchmark_data/fivetran/thin/existing/*.parquet' ], filename=_fivetran_filename) AS existing WHERE EXISTS ( diff --git a/benchmark/fivetran/queries/q51.sql b/benchmark/fivetran/queries/q51.sql new file mode 100644 index 00000000000..82f6431c30b --- /dev/null +++ b/benchmark/fivetran/queries/q51.sql @@ -0,0 +1,28 @@ +-- wide delete scenario with string + instant pk +COPY ( + SELECT + * + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS existing + WHERE NOT EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + ) +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git 
a/benchmark/fivetran/queries/q52.sql b/benchmark/fivetran/queries/q52.sql new file mode 100644 index 00000000000..6c185c97b14 --- /dev/null +++ b/benchmark/fivetran/queries/q52.sql @@ -0,0 +1,29 @@ +-- wide delete scenario with string + instant pk (with tiny existing file) +COPY ( + SELECT + * + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet', + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet' + ]) AS existing + WHERE NOT EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + ) +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q53.sql b/benchmark/fivetran/queries/q53.sql new file mode 100644 index 00000000000..bb2ec0223fa --- /dev/null +++ b/benchmark/fivetran/queries/q53.sql @@ -0,0 +1,29 @@ +-- wide delete scenario with string + instant pk (with tiny incoming file) +COPY ( + SELECT + * + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS existing + WHERE NOT EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + ) +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q54.sql b/benchmark/fivetran/queries/q54.sql new file mode 100644 index 00000000000..6d376737df4 --- /dev/null 
+++ b/benchmark/fivetran/queries/q54.sql @@ -0,0 +1,30 @@ +-- wide delete scenario with string + instant pk (with tiny existing and incoming file) +COPY ( + SELECT + * + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS existing + WHERE NOT EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + ) +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q55.sql b/benchmark/fivetran/queries/q55.sql new file mode 100644 index 00000000000..40168ca051a --- /dev/null +++ b/benchmark/fivetran/queries/q55.sql @@ -0,0 +1,37 @@ +-- wide update scenario with string + instant pk +COPY ( + SELECT + "existing".* REPLACE ( + update_macro("existing"."string_0", "staging"."string_0", 1, "_fivetran_updated_cols") AS "string_0", + update_macro("existing"."string_1", "staging"."string_1", 2, "_fivetran_updated_cols") AS "string_1", + update_macro("existing"."string_10", "staging"."string_10", 3, "_fivetran_updated_cols") AS "string_10", + update_macro("existing"."string_11", "staging"."string_11", 3, "_fivetran_updated_cols") AS "string_11", + update_macro("existing"."string_20", "staging"."string_20", 4, "_fivetran_updated_cols") AS "string_20", + update_macro("existing"."string_21", "staging"."string_21", 5, "_fivetran_updated_cols") AS "string_21", + update_macro("existing"."string_30", "staging"."string_30", 6, "_fivetran_updated_cols") AS "string_30", + update_macro("existing"."string_31", "staging"."string_31", 7, "_fivetran_updated_cols") AS 
"string_31", + update_macro("existing"."string_40", "staging"."string_40", 7, "_fivetran_updated_cols") AS "string_40", + update_macro("existing"."string_41", "staging"."string_41", 8, "_fivetran_updated_cols") AS "string_41", + CASE WHEN "staging"."_fivetran_synced" IS NULL THEN "existing"."_fivetran_synced" ELSE "staging"."_fivetran_synced" END AS "_fivetran_synced", + ) + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS "existing" + LEFT JOIN + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/update/*.parquet' + ]) AS "staging" + ON + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + AND "existing"."_fivetran_start" = "staging"."_fivetran_start" +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q56.sql b/benchmark/fivetran/queries/q56.sql new file mode 100644 index 00000000000..b3951009d9e --- /dev/null +++ b/benchmark/fivetran/queries/q56.sql @@ -0,0 +1,38 @@ +-- wide update scenario with string + instant pk (with tiny existing file) +COPY ( + SELECT + "existing".* REPLACE ( + update_macro("existing"."string_0", "staging"."string_0", 1, "_fivetran_updated_cols") AS "string_0", + update_macro("existing"."string_1", "staging"."string_1", 2, "_fivetran_updated_cols") AS "string_1", + update_macro("existing"."string_10", "staging"."string_10", 3, "_fivetran_updated_cols") AS "string_10", + update_macro("existing"."string_11", "staging"."string_11", 3, "_fivetran_updated_cols") AS "string_11", + update_macro("existing"."string_20", "staging"."string_20", 4, "_fivetran_updated_cols") AS "string_20", + update_macro("existing"."string_21", "staging"."string_21", 5, "_fivetran_updated_cols") AS "string_21", + update_macro("existing"."string_30", "staging"."string_30", 6, 
"_fivetran_updated_cols") AS "string_30", + update_macro("existing"."string_31", "staging"."string_31", 7, "_fivetran_updated_cols") AS "string_31", + update_macro("existing"."string_40", "staging"."string_40", 7, "_fivetran_updated_cols") AS "string_40", + update_macro("existing"."string_41", "staging"."string_41", 8, "_fivetran_updated_cols") AS "string_41", + CASE WHEN "staging"."_fivetran_synced" IS NULL THEN "existing"."_fivetran_synced" ELSE "staging"."_fivetran_synced" END AS "_fivetran_synced", + ) + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS "existing" + LEFT JOIN + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/update/*.parquet' + ]) AS "staging" + ON + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + AND "existing"."_fivetran_start" = "staging"."_fivetran_start" +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q57.sql b/benchmark/fivetran/queries/q57.sql new file mode 100644 index 00000000000..1126fc5a912 --- /dev/null +++ b/benchmark/fivetran/queries/q57.sql @@ -0,0 +1,38 @@ +-- wide update scenario with string + instant pk (with tiny incoming file) +COPY ( + SELECT + "existing".* REPLACE ( + update_macro("existing"."string_0", "staging"."string_0", 1, "_fivetran_updated_cols") AS "string_0", + update_macro("existing"."string_1", "staging"."string_1", 2, "_fivetran_updated_cols") AS "string_1", + update_macro("existing"."string_10", "staging"."string_10", 3, "_fivetran_updated_cols") AS "string_10", + update_macro("existing"."string_11", "staging"."string_11", 3, "_fivetran_updated_cols") AS "string_11", + update_macro("existing"."string_20", "staging"."string_20", 4, 
"_fivetran_updated_cols") AS "string_20", + update_macro("existing"."string_21", "staging"."string_21", 5, "_fivetran_updated_cols") AS "string_21", + update_macro("existing"."string_30", "staging"."string_30", 6, "_fivetran_updated_cols") AS "string_30", + update_macro("existing"."string_31", "staging"."string_31", 7, "_fivetran_updated_cols") AS "string_31", + update_macro("existing"."string_40", "staging"."string_40", 7, "_fivetran_updated_cols") AS "string_40", + update_macro("existing"."string_41", "staging"."string_41", 8, "_fivetran_updated_cols") AS "string_41", + CASE WHEN "staging"."_fivetran_synced" IS NULL THEN "existing"."_fivetran_synced" ELSE "staging"."_fivetran_synced" END AS "_fivetran_synced", + ) + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS "existing" + LEFT JOIN + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_update.parquet', + 'duckdb_benchmark_data/fivetran/wide/update/*.parquet' + ]) AS "staging" + ON + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + AND "existing"."_fivetran_start" = "staging"."_fivetran_start" +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q58.sql b/benchmark/fivetran/queries/q58.sql new file mode 100644 index 00000000000..0b2a9201959 --- /dev/null +++ b/benchmark/fivetran/queries/q58.sql @@ -0,0 +1,39 @@ +-- wide update scenario with string + instant pk (with tiny existing and incoming file) +COPY ( + SELECT + "existing".* REPLACE ( + update_macro("existing"."string_0", "staging"."string_0", 1, "_fivetran_updated_cols") AS "string_0", + update_macro("existing"."string_1", "staging"."string_1", 2, "_fivetran_updated_cols") AS "string_1", + update_macro("existing"."string_10", "staging"."string_10", 3, 
"_fivetran_updated_cols") AS "string_10", + update_macro("existing"."string_11", "staging"."string_11", 3, "_fivetran_updated_cols") AS "string_11", + update_macro("existing"."string_20", "staging"."string_20", 4, "_fivetran_updated_cols") AS "string_20", + update_macro("existing"."string_21", "staging"."string_21", 5, "_fivetran_updated_cols") AS "string_21", + update_macro("existing"."string_30", "staging"."string_30", 6, "_fivetran_updated_cols") AS "string_30", + update_macro("existing"."string_31", "staging"."string_31", 7, "_fivetran_updated_cols") AS "string_31", + update_macro("existing"."string_40", "staging"."string_40", 7, "_fivetran_updated_cols") AS "string_40", + update_macro("existing"."string_41", "staging"."string_41", 8, "_fivetran_updated_cols") AS "string_41", + CASE WHEN "staging"."_fivetran_synced" IS NULL THEN "existing"."_fivetran_synced" ELSE "staging"."_fivetran_synced" END AS "_fivetran_synced", + ) + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ]) AS "existing" + LEFT JOIN + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_update.parquet', + 'duckdb_benchmark_data/fivetran/wide/update/*.parquet' + ]) AS "staging" + ON + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" + AND "existing"."_fivetran_start" = "staging"."_fivetran_start" +) TO 'duckdb_benchmark_data/fivetran/output' ( + FORMAT PARQUET, + COMPRESSION ZSTD, + COMPRESSION_LEVEL 1, + PER_THREAD_OUTPUT TRUE, + ROW_GROUP_SIZE_BYTES '64 MB', + ROW_GROUPS_PER_FILE 1, + OVERWRITE TRUE +); diff --git a/benchmark/fivetran/queries/q59.sql b/benchmark/fivetran/queries/q59.sql new file mode 100644 index 00000000000..afe340dcba4 --- /dev/null +++ b/benchmark/fivetran/queries/q59.sql @@ -0,0 +1,18 @@ +-- wide recon scenario with string + instant pk +SELECT + DISTINCT("_fivetran_filename") +FROM + read_parquet([ + 
'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ], filename=_fivetran_filename) AS existing +WHERE EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" +); diff --git a/benchmark/fivetran/queries/q60.sql b/benchmark/fivetran/queries/q60.sql new file mode 100644 index 00000000000..13ce46d6aa2 --- /dev/null +++ b/benchmark/fivetran/queries/q60.sql @@ -0,0 +1,19 @@ +-- wide recon scenario with string + instant pk (with tiny existing file) +SELECT + DISTINCT("_fivetran_filename") +FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ], filename=_fivetran_filename) AS existing +WHERE EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" +); diff --git a/benchmark/fivetran/queries/q61.sql b/benchmark/fivetran/queries/q61.sql new file mode 100644 index 00000000000..32c8ad102f8 --- /dev/null +++ b/benchmark/fivetran/queries/q61.sql @@ -0,0 +1,19 @@ +-- wide recon scenario with string + instant pk (with tiny incoming file) +SELECT + DISTINCT("_fivetran_filename") +FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ], filename=_fivetran_filename) AS existing +WHERE EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" +); diff --git a/benchmark/fivetran/queries/q62.sql b/benchmark/fivetran/queries/q62.sql new file mode 100644 index 
00000000000..511fd2683ff --- /dev/null +++ b/benchmark/fivetran/queries/q62.sql @@ -0,0 +1,20 @@ +-- wide recon scenario with string + instant pk (with tiny existing and incoming file) +SELECT + DISTINCT("_fivetran_filename") +FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/existing/*.parquet' + ], filename=_fivetran_filename) AS existing +WHERE EXISTS ( + SELECT + TRUE + FROM + read_parquet([ + 'duckdb_benchmark_data/fivetran/wide/tiny_data.parquet', + 'duckdb_benchmark_data/fivetran/wide/incoming/*.parquet' + ]) AS staging + WHERE + "existing"."string_pk" = "staging"."string_pk" + AND "existing"."instant_id" = "staging"."instant_id" +);