From 2e23b5f3aa4301357766d8b2ba6fd2855a496f35 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Thu, 25 Jul 2019 11:37:25 +0530 Subject: [PATCH 01/14] Updated readme.md Corrected typo in the name of the folder that contains TPCH datagen resources. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 71b2514..4c0abcf 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen ``` 2. Upload the resources to DFS. ```shell - hdfs dfs -copyFromLocal resoruces /tmp + hdfs dfs -copyFromLocal resources /tmp ``` 3. Run TPCHDataGen.hql with settings.hql file and set the required config variables. From 0dd08dfc00aa6075bcb7582be573f691b8c18965 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:05:38 +0530 Subject: [PATCH 02/14] Update README.md * Updated repo URL to clone in README.md * Added note on file permissions for HDI 4.0 * Added note on URL for ADLS --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4c0abcf..a75f612 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen 1. Clone this repo. ```shell - git clone https://github.com/dharmeshkakadia/tpch-datagen-as-hive-query/ && cd tpch-datagen-as-hive-query + git clone https://github.com/cruizen/tpch-hdinsight.git && cd tpch-hdinsight ``` 2. Run TPCHDataGen.hql with settings.hql file and set the required config variables. ```shell @@ -24,6 +24,11 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen ```shell hive -i settings.hql -f ddl/createAllExternalTables.hql -hiveconf LOCATION=/HiveTPCH/ -hiveconf DBNAME=tpch ``` + For HDI 4.0, allow permissions to other users on the storage by running + ```shell + hdfs dfs -chmod -R 777 /HiveTPCH + ``` + Generate ORC tables and analyze ```shell hive -i settings.hql -f ddl/createAllORCTables.hql -hiveconf ORCDBNAME=tpch_orc -hiveconf SOURCE=tpch @@ -39,7 +44,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen 1. Clone this repo. ```shell - git clone https://github.com/dharmeshkakadia/tpch-datagen-as-hive-query/ && cd tpch-datagen-as-hive-query + git clone https://github.com/cruizen/tpch-hdinsight.git && cd tpch-hdinsight ``` 2. Upload the resources to DFS. ```shell @@ -54,6 +59,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen `PARTS` is a number of task to use for datagen (parrellelization), `LOCATION` is the directory where the data will be stored on HDFS, `TPCHBIN` is where the resources are uploaded on step 2. You can specify specific settings in settings.hql file. + When ADLS is used as the storage instead of Azure blob storage, replace wasb in the URL for fs.defaultFS with abfs since ADLS uses the abfs:// storage scheme. 4. Now you can create tables on the generated data. ```shell From 85f1ef919d5175d0e528ec638cbbb912c9293053 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 16 Sep 2019 18:38:09 +0000 Subject: [PATCH 03/14] Comment out the CREATE TABLE resultsTable sentence in query definition files --- queries/tpch_query1.hql | 2 +- queries/tpch_query10.hql | 2 +- queries/tpch_query11.hql | 2 +- queries/tpch_query12.hql | 2 +- queries/tpch_query13.hql | 2 +- queries/tpch_query14.hql | 2 +- queries/tpch_query15.hql | 2 +- queries/tpch_query16.hql | 2 +- queries/tpch_query17.hql | 2 +- queries/tpch_query19.hql | 2 +- queries/tpch_query2.hql | 2 +- queries/tpch_query20.hql | 2 +- queries/tpch_query21.hql | 2 +- queries/tpch_query22.hql | 2 +- queries/tpch_query3.hql | 2 +- queries/tpch_query4.hql | 2 +- queries/tpch_query5.hql | 2 +- queries/tpch_query6.hql | 2 +- queries/tpch_query7.hql | 2 +- queries/tpch_query8.hql | 2 +- queries/tpch_query9.hql | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/queries/tpch_query1.hql b/queries/tpch_query1.hql index 49ee202..8cc8dd0 100644 --- a/queries/tpch_query1.hql +++ b/queries/tpch_query1.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query1_result AS +--CREATE TABLE tpch_query1_result AS SELECT l_returnflag ,l_linestatus diff --git a/queries/tpch_query10.hql b/queries/tpch_query10.hql index e6bcaaf..1dcee11 100644 --- a/queries/tpch_query10.hql +++ b/queries/tpch_query10.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query10_result AS +--CREATE TABLE tpch_query10_result AS SELECT c_custkey ,c_name diff --git a/queries/tpch_query11.hql b/queries/tpch_query11.hql index cdfe2b1..cd4d9bb 100644 --- a/queries/tpch_query11.hql +++ b/queries/tpch_query11.hql @@ -19,7 +19,7 @@ AS SELECT sum(part_value) AS total_value FROM q11_part_tmp_cached; -CREATE TABLE tpch_query11_result AS +--CREATE TABLE tpch_query11_result AS SELECT ps_partkey ,part_value AS value diff --git a/queries/tpch_query12.hql b/queries/tpch_query12.hql index d4b4bb8..e0621d1 100644 --- a/queries/tpch_query12.hql +++ b/queries/tpch_query12.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query12_result AS +--CREATE TABLE tpch_query12_result AS SELECT l_shipmode ,sum(CASE diff --git a/queries/tpch_query13.hql b/queries/tpch_query13.hql index 0ec2997..a9bbd1e 100644 --- a/queries/tpch_query13.hql +++ b/queries/tpch_query13.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query13_result AS +--CREATE TABLE tpch_query13_result AS SELECT c_count ,count(*) AS custdist diff --git a/queries/tpch_query14.hql b/queries/tpch_query14.hql index 3721ef2..82c3a1d 100644 --- a/queries/tpch_query14.hql +++ b/queries/tpch_query14.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query14_result AS +--CREATE TABLE tpch_query14_result AS SELECT 100.00 * sum(CASE WHEN p_type LIKE 'PROMO%' diff --git a/queries/tpch_query15.hql b/queries/tpch_query15.hql index aedec4f..9507e28 100644 --- a/queries/tpch_query15.hql +++ b/queries/tpch_query15.hql @@ -16,7 +16,7 @@ AS SELECT max(total_revenue) AS max_revenue FROM revenue_cached; -CREATE TABLE tpch_query15_result AS +--CREATE TABLE tpch_query15_result AS SELECT s_suppkey ,s_name diff --git a/queries/tpch_query16.hql b/queries/tpch_query16.hql index e608990..0de6988 100644 --- a/queries/tpch_query16.hql +++ b/queries/tpch_query16.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query16_result AS +--CREATE TABLE tpch_query16_result AS SELECT p_brand ,p_type diff --git a/queries/tpch_query17.hql b/queries/tpch_query17.hql index bb9462b..5a7c7f5 100644 --- a/queries/tpch_query17.hql +++ b/queries/tpch_query17.hql @@ -7,7 +7,7 @@ SELECT l_partkey AS t_partkey FROM lineitem GROUP BY l_partkey; -CREATE TABLE tpch_query17_result AS +--CREATE TABLE tpch_query17_result AS SELECT sum(l_extendedprice) / 7.0 AS avg_yearly FROM ( diff --git a/queries/tpch_query19.hql b/queries/tpch_query19.hql index 702de3c..c75f99d 100644 --- a/queries/tpch_query19.hql +++ b/queries/tpch_query19.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query19_result AS +--CREATE TABLE tpch_query19_result AS SELECT sum(l_extendedprice * (1 - l_discount)) AS revenue FROM lineitem diff --git a/queries/tpch_query2.hql b/queries/tpch_query2.hql index 9f5a7dc..8dc4b2f 100644 --- a/queries/tpch_query2.hql +++ b/queries/tpch_query2.hql @@ -16,7 +16,7 @@ WHERE p_partkey = ps_partkey AND r_name = 'EUROPE' GROUP BY p_partkey; -CREATE TABLE tpch_query2_result AS +--CREATE TABLE tpch_query2_result AS SELECT s_acctbal ,s_name diff --git a/queries/tpch_query20.hql b/queries/tpch_query20.hql index fb819c5..999fe2c 100644 --- a/queries/tpch_query20.hql +++ b/queries/tpch_query20.hql @@ -42,7 +42,7 @@ FROM q20_tmp3_cached WHERE ps_availqty > sum_quantity GROUP BY ps_suppkey; -CREATE TABLE tpch_query20_result AS +--CREATE TABLE tpch_query20_result AS SELECT s_name ,s_address diff --git a/queries/tpch_query21.hql b/queries/tpch_query21.hql index 422f965..18ec1d0 100644 --- a/queries/tpch_query21.hql +++ b/queries/tpch_query21.hql @@ -21,7 +21,7 @@ WHERE l_receiptdate > l_commitdate AND l_orderkey IS NOT NULL GROUP BY l_orderkey; -CREATE TABLE tpch_query21_result AS +--CREATE TABLE tpch_query21_result AS SELECT s_name ,count(1) AS numwait diff --git a/queries/tpch_query22.hql b/queries/tpch_query22.hql index 2526847..3d90975 100644 --- a/queries/tpch_query22.hql +++ b/queries/tpch_query22.hql @@ -29,7 +29,7 @@ CREATE VIEW IF NOT EXISTS q22_orders_tmp_cached AS FROM orders GROUP BY o_custkey; -CREATE TABLE tpch_query22_result AS +--CREATE TABLE tpch_query22_result AS SELECT cntrycode ,count(1) AS numcust diff --git a/queries/tpch_query3.hql b/queries/tpch_query3.hql index 6e14550..932cd41 100644 --- a/queries/tpch_query3.hql +++ b/queries/tpch_query3.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query3_result AS +--CREATE TABLE tpch_query3_result AS SELECT l_orderkey ,sum(l_extendedprice * (1 - l_discount)) AS revenue diff --git a/queries/tpch_query4.hql b/queries/tpch_query4.hql index ac8ba48..ec6f2b7 100644 --- a/queries/tpch_query4.hql +++ b/queries/tpch_query4.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query4_result AS +--CREATE TABLE tpch_query4_result AS SELECT o_orderpriority ,count(*) AS order_count diff --git a/queries/tpch_query5.hql b/queries/tpch_query5.hql index 6379dac..f985d7a 100644 --- a/queries/tpch_query5.hql +++ b/queries/tpch_query5.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query5_result AS +--CREATE TABLE tpch_query5_result AS SELECT n_name ,sum(l_extendedprice * (1 - l_discount)) AS revenue diff --git a/queries/tpch_query6.hql b/queries/tpch_query6.hql index 41a9847..393f1cc 100644 --- a/queries/tpch_query6.hql +++ b/queries/tpch_query6.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query6_result AS +--CREATE TABLE tpch_query6_result AS SELECT sum(l_extendedprice * l_discount) AS revenue FROM lineitem diff --git a/queries/tpch_query7.hql b/queries/tpch_query7.hql index 2656e7d..b3736a3 100644 --- a/queries/tpch_query7.hql +++ b/queries/tpch_query7.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query7_result AS +--CREATE TABLE tpch_query7_result AS SELECT supp_nation ,cust_nation diff --git a/queries/tpch_query8.hql b/queries/tpch_query8.hql index e3807cf..96f66fe 100644 --- a/queries/tpch_query8.hql +++ b/queries/tpch_query8.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query8_result AS +--CREATE TABLE tpch_query8_result AS SELECT o_year ,sum(CASE diff --git a/queries/tpch_query9.hql b/queries/tpch_query9.hql index 79b0d0b..619c7ab 100644 --- a/queries/tpch_query9.hql +++ b/queries/tpch_query9.hql @@ -1,4 +1,4 @@ -CREATE TABLE tpch_query9_result AS +--CREATE TABLE tpch_query9_result AS SELECT nation ,o_year From 0bf68cb8d3502180448d60ae1ec4ffbe073ff0af Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:16:36 +0530 Subject: [PATCH 04/14] Update README.md added header row for the query results output file --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a75f612..c1998b7 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen ```shell beeline -u "jdbc:hive2://`hostname -f`:10001/;transportMode=http" -n "" -p "" -i settings.hql -f ddl/createAllExternalTables.hql -hiveconf LOCATION=/HiveTPCH/ -hiveconf DBNAME=tpch ``` + For HDI 4.0, allow permissions to other users on the storage by running + ```shell + hdfs dfs -chmod -R 777 /HiveTPCH + ``` Generate ORC tables and analyze ```shell beeline -u "jdbc:hive2://`hostname -f`:10001/;transportMode=http" -n "" -p "" -i settings.hql -f ddl/createAllORCTables.hql -hiveconf ORCDBNAME=tpch_orc -hiveconf SOURCE=tpch @@ -78,6 +82,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen If you want to run all the queries 10 times and measure the times it takes, you can use the following command: + echo "Query,run,start_time,end_time,duration" >> times_orc.csv for f in queries/*.sql; do for i in {1..10} ; do STARTTIME="`date +%s`"; beeline -u "jdbc:hive2://`hostname -f`:10001/tpch_orc;transportMode=http" -i settings.hql -f $f > $f.run_$i.out 2>&1 ; ENDTIME="`date +%s`"; echo "$f,$i,$STARTTIME,$ENDTIME,$(($ENDTIME-$STARTTIME))" >> times_orc.csv; done; done; ## FAQ From aa96a99a2081af339673ea7249ce4c824d1c7a35 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:20:56 +0530 Subject: [PATCH 05/14] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c1998b7..5f5dc0b 100644 --- a/README.md +++ b/README.md @@ -80,11 +80,12 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen beeline -u "jdbc:hive2://`hostname -f`:10001/tpch_orc;transportMode=http" -n "" -p "" -i settings.hql -f queries/tpch_query1.hql ``` -If you want to run all the queries 10 times and measure the times it takes, you can use the following command: - +If you want to run all the queries 10 times and measure the times it takes, you can use the following command + echo "Query,run,start_time,end_time,duration" >> times_orc.csv for f in queries/*.sql; do for i in {1..10} ; do STARTTIME="`date +%s`"; beeline -u "jdbc:hive2://`hostname -f`:10001/tpch_orc;transportMode=http" -i settings.hql -f $f > $f.run_$i.out 2>&1 ; ENDTIME="`date +%s`"; echo "$f,$i,$STARTTIME,$ENDTIME,$(($ENDTIME-$STARTTIME))" >> times_orc.csv; done; done; + ## FAQ 1. Does it work with scale factor 1? From 0d4a17ae0e851f8f97dd3359864ec9034281b42d Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:22:02 +0530 Subject: [PATCH 06/14] Update README.md added header row for the results file, minor syntax tweak --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f5dc0b..69c76a8 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ This are set of UDFs and queries that you can use with Hive to use TPCH datagen If you want to run all the queries 10 times and measure the times it takes, you can use the following command - echo "Query,run,start_time,end_time,duration" >> times_orc.csv + echo "Query,run,start_time,end_time,duration" >> times_orc.csv; for f in queries/*.sql; do for i in {1..10} ; do STARTTIME="`date +%s`"; beeline -u "jdbc:hive2://`hostname -f`:10001/tpch_orc;transportMode=http" -i settings.hql -f $f > $f.run_$i.out 2>&1 ; ENDTIME="`date +%s`"; echo "$f,$i,$STARTTIME,$ENDTIME,$(($ENDTIME-$STARTTIME))" >> times_orc.csv; done; done; From b3010eda78416923b12f7d7526f16a3d56d60b16 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:32:44 +0530 Subject: [PATCH 07/14] Update azuredeploy.json Updated default cluster version to 3.6 from 3.5 --- azure/azuredeploy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/azuredeploy.json b/azure/azuredeploy.json index 38500c4..4d79842 100644 --- a/azure/azuredeploy.json +++ b/azure/azuredeploy.json @@ -80,7 +80,7 @@ "apiVersion": "[variables('clusterApiVersion')]", "dependsOn": ["[concat('Microsoft.Storage/storageAccounts/',variables('clusterStorageAccountName'))]"], "properties": { - "clusterVersion": "3.5", + "clusterVersion": "3.6", "osType": "Linux", "tier": "standard", "clusterDefinition": { From 75d675eea74ba58630275e71ab47d3b6d2625235 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 14:52:04 +0530 Subject: [PATCH 08/14] Update azuredeploy.json Fixed URI of json to point to the forked repo --- azure/azuredeploy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/azuredeploy.json b/azure/azuredeploy.json index 4d79842..fedaa96 100644 --- a/azure/azuredeploy.json +++ b/azure/azuredeploy.json @@ -116,7 +116,7 @@ "scriptActions": [ { "name": "TPCH Benchmark", - "uri": "https://raw.githubusercontent.com/dharmeshkakadia/tpch-datagen-as-hive-query/master/azure/TPCH_installer.sh", + "uri": "https://raw.githubusercontent.com/dharmeshkakadia/cruizen/tpch-hdinsight/master/azure/TPCH_installer.sh", "parameters": "[parameters('ScaleFactor')]" } ] From 7f7fd3a796c47ae4243c63fcd1a40c16e7eafaba Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 14:53:18 +0530 Subject: [PATCH 09/14] Update TPCH_installer.sh Update URL of installer.sh to point to the forked repo --- azure/TPCH_installer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/TPCH_installer.sh b/azure/TPCH_installer.sh index 9903c77..564462a 100644 --- a/azure/TPCH_installer.sh +++ b/azure/TPCH_installer.sh @@ -3,7 +3,7 @@ wget -O /tmp/HDInsightUtilities-v01.sh -q https://hdiconfigactions.blob.core.windows.net/linuxconfigactionmodulev01/HDInsightUtilities-v01.sh && source /tmp/HDInsightUtilities-v01.sh && rm -f /tmp/HDInsightUtilities-v01.sh if [[ `hostname -f` == `get_primary_headnode` ]]; then - wget https://github.com/dharmeshkakadia/tpch-datagen-as-hive-query/archive/master.zip + wget https://github.com/cruizen/tpch-hdinsight/archive/master.zip unzip master.zip; cd tpch-datagen-as-hive-query-master; hive -i settings.hql -f TPCHDataGen.hql -hiveconf SCALE=$1 -hiveconf PARTS=$1 -hiveconf LOCATION=/HiveTPCH_$1/ -hiveconf TPCHBIN=resources hive -i settings.hql -f ddl/createAllExternalTables.hql -hiveconf LOCATION=/HiveTPCH_$1/ -hiveconf DBNAME=tpch_$1 From 6bf08fce3123b71fe2ef8063ef3292a07a3489b1 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 14:56:17 +0530 Subject: [PATCH 10/14] Updated URL for 'deploy to azure' in readme.md Update URL for the 'deploy to azure' button --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 69c76a8..1ca60dd 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # tpch-datagen-as-hive-query This are set of UDFs and queries that you can use with Hive to use TPCH datagen in parrellel on hadoop cluster. You can deploy to azure using : - + From d0f2b07988ce8d2de285795d904f1c6b6adc8760 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 15:47:58 +0530 Subject: [PATCH 11/14] Update azuredeploy.json corrected the URI of TPCH_installer.sh --- azure/azuredeploy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/azuredeploy.json b/azure/azuredeploy.json index fedaa96..54895b6 100644 --- a/azure/azuredeploy.json +++ b/azure/azuredeploy.json @@ -116,7 +116,7 @@ "scriptActions": [ { "name": "TPCH Benchmark", - "uri": "https://raw.githubusercontent.com/dharmeshkakadia/cruizen/tpch-hdinsight/master/azure/TPCH_installer.sh", + "uri": "https://raw.githubusercontent.com/cruizen/tpch-hdinsight/master/azure/TPCH_installer.sh", "parameters": "[parameters('ScaleFactor')]" } ] From caa4320c9955d0527d4819ac1cf04333196642d8 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Tue, 17 Sep 2019 18:52:23 +0530 Subject: [PATCH 12/14] Update azuredeploy.parameters.json --- azure/azuredeploy.parameters.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/azuredeploy.parameters.json b/azure/azuredeploy.parameters.json index 2f2f226..774778d 100644 --- a/azure/azuredeploy.parameters.json +++ b/azure/azuredeploy.parameters.json @@ -9,7 +9,7 @@ "value": "hdiuser" }, "loginPassword": { - "value": "changeme" + "value": "Snappy123!!!" } } } From dbfd7748e631f372340be752001c06f4f6ebd2f5 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Wed, 18 Sep 2019 11:38:37 +0530 Subject: [PATCH 13/14] Update createAllORCTables.hql Set 'auto.purge'='true' for the large tables where INSERT OVERWRITE is attempted. --- ddl/createAllORCTables.hql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ddl/createAllORCTables.hql b/ddl/createAllORCTables.hql index 1a32667..4e9b342 100644 --- a/ddl/createAllORCTables.hql +++ b/ddl/createAllORCTables.hql @@ -23,7 +23,7 @@ L_SHIPMODE STRING, L_COMMENT STRING) PARTITIONED BY (L_SHIPDATE STRING) STORED AS ORC -TBLPROPERTIES('orc.bloom.filter.columns'='*','orc.compress'='ZLIB'); +TBLPROPERTIES('orc.bloom.filter.columns'='*','orc.compress'='ZLIB','auto.purge'='true'); INSERT OVERWRITE TABLE lineitem PARTITION(L_SHIPDATE) SELECT @@ -59,7 +59,7 @@ O_SHIPPRIORITY INT, O_COMMENT STRING) PARTITIONED BY (O_ORDERDATE STRING) STORED AS ORC -TBLPROPERTIES('orc.bloom.filter.columns'='*','orc.compress'='ZLIB'); +TBLPROPERTIES('orc.bloom.filter.columns'='*','orc.compress'='ZLIB','auto.purge'='true'); INSERT OVERWRITE TABLE orders PARTITION(O_ORDERDATE) SELECT From 5070a96a3d91cd3335f2dc5f3bd02ab0d9890990 Mon Sep 17 00:00:00 2001 From: Trilok Khairnar <214651+cruizen@users.noreply.github.com> Date: Wed, 18 Sep 2019 11:44:27 +0530 Subject: [PATCH 14/14] Update settings.hql Updated settings file with what we used. (LLAP mode enabled for all queries. Vectorization and LLAP IO cache enabled) --- settings.hql | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/settings.hql b/settings.hql index 2f61006..c98c88d 100644 --- a/settings.hql +++ b/settings.hql @@ -1 +1,13 @@ - SET hive.tez.container.size=2048; \ No newline at end of file +set hive.execution.engine=tez; +set hive.tez.container.size=4096; +set hive.tez.java.opts=-Xmx3800m; +-- set hive.auto.convert.join.noconditionaltask.size=1252698795; +set hive.vectorized.execution.enabled=true; +set hive.execution.mode=llap; +set hive.llap.execution.mode=all; +set hive.llap.io.enabled=true; +set hive.llap.io.memory.mode=cache; + +-- Dynamic partitioning in Hive. We tested with the default value as well as the following turned on. +SET hive.exec.dynamic.partition = true; +SET hive.exec.dynamic.partition.mode = nonstrict;