spark2_1_0_kudu.py
# Dated           : Aug 26, 2017
# About           : Sample PySpark (2.1.0) code to count the number of rows in a Kudu table
# PySpark Version : 2.1.0
# Kudu Version    : 1.2.0
# Coder           : Ankit Sarraf
# Python 2's ConfigParser module (renamed to 'configparser' in Python 3)
import ConfigParser

from pyspark.sql import SparkSession

# Legacy Spark 1.x imports and configuration, kept for reference:
# from pyspark import SparkContext
# from pyspark import SparkConf
# from pyspark.sql import SQLContext
# conf = (SparkConf().setMaster("yarn-client").setAppName("KuduSpark_AS"))

# Create the SparkSession (Spark 2.x entry point) with Hive support enabled
spark = SparkSession.builder \
    .appName('KuduSpark_2.1') \
    .enableHiveSupport() \
    .getOrCreate()
configParser = ConfigParser.ConfigParser()
configParser.read('test.conf')
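
# An illustrative test.conf layout (the values below are placeholders, not taken from the original):
#   [Kudu]
#   KuduMaster = kudu-master-host:7051
#   KuduTable  = my_kudu_table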

# Kudu master address(es), as provided in the config file
kudu_master = configParser.get('Kudu', 'KuduMaster')

# Kudu table to be read
kudu_table = configParser.get('Kudu', 'KuduTable')

# Display the values read from the config file
print('KUDU MASTER: ' + kudu_master)
print('KUDU TABLE : ' + kudu_table)

# Read the Kudu table into a DataFrame via the kudu-spark connector (evaluated lazily)
kudu_data_df = spark.read.format('org.apache.kudu.spark.kudu') \
    .option('kudu.master', kudu_master) \
    .option('kudu.table', kudu_table) \
    .load()
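# Note: 'kudu.master' also accepts a comma-separated list of master addresses
# when running against a multi-master Kudu cluster.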

# Register the DataFrame as a temporary view so it can be queried with Spark SQL
kudu_data_df.createOrReplaceTempView("tab")

# Count the number of rows in the table
spark.sql("SELECT count(*) FROM tab").show()